In [25]:
# Hashtags attached to each BBC news item, keyed by item id ("news1", ...).
# Each value is a list of "#CamelCase" hashtag strings.
bbc_news_hashtags = {
    "news1": [
        "#DonaldTrump",
        "#DanaWhite",
        "#UFC",
        "#RNC",
        "#MMA",
        "#Republican",
        "#JohnMcCain",
        "#TrumpTajMahal",
        "#AtlanticCity",
        "#NewJersey",
        "#USSenator",
    ],
    "news2": [
        "#Fangirls",
        "#HarryStyles",
        "#Swifties",
        "#TaylorSwift",
        "#ErasTour",
        "#YveBlake",
        "#OneDirection",
        "#ZaynMalik",
        "#Musical",
        "#SydneyOperaHouse",
        "#LyricTheatre",
        "#Hammersmith",
        "#JasmineElcock",
        "#BritainsGotTalent",
    ],
    "news3": [
        "#GreyhoundRacing",
        "#AnimalCruelty",
        "#Whistleblower",
        "#Australia",
        "#GRNSW",
        "#NSW",
        "#AlexBrittan",
        "#AnimalWelfare",
        "#DogRacing",
        "#Injustice",
        "#Inquiry",
        "#DavidHarris",
        "#ChrisMinns",
        "#EmmaHurst",
        "#Sydney",
    ],
}
# Hashtags attached to each Guardian news item, keyed by item id.
# Each value is a list of "#CamelCase" hashtag strings.
guardian_news_hashtags = {
    "news1": [
        "#GreyhoundRacing",
        "#AnimalWelfare",
        "#AnimalAbuse",
        "#VeterinaryReport",
        "#NSWInquiry",
        "#PublicHearings",
        "#WhistleblowerProtection",
        "#ChrisMinns",
        "#RacingIndustry",
        "#GamblingIndustry",
    ],
}

In [26]:
import re

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sentence-embedding model used to encode the hashtag texts below;
# "all-MiniLM-L6-v2" is the checkpoint name handed to SentenceTransformer.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Preprocess hashtags into strings
def preprocess_hashtags(hashtags, split_words=False):
    """Turn a collection of hashtags into one space-separated, lower-cased string.

    Parameters
    ----------
    hashtags : iterable of str
        Hashtags such as ``"#DonaldTrump"``.
    split_words : bool, default False
        When False, each hashtag is only lower-cased, so ``"#DonaldTrump"``
        becomes ``"#donaldtrump"``.
        When True, leading ``#`` characters are dropped and the tag is split
        at its CamelCase boundaries before lower-casing, so ``"#DonaldTrump"``
        becomes ``"donald trump"`` and ``"#NSWInquiry"`` becomes
        ``"nsw inquiry"``.

    Returns
    -------
    str
        The processed hashtags joined by single spaces; ``""`` for empty input.
    """
    if not split_words:
        return ' '.join(hashtag.lower() for hashtag in hashtags)

    # Lower-casing erases the capitals that mark where one word ends and the
    # next begins, so the boundaries are turned into spaces first.
    #   first alternative : lower-case letter or digit followed by a capital (Donald|Trump)
    #   second alternative: end of an acronym followed by a capitalised word (NSW|Inquiry)
    camel_boundary = r'(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])'
    return ' '.join(
        re.sub(camel_boundary, ' ', hashtag.lstrip('#')).lower()
        for hashtag in hashtags
    )

# Collapse each news item's hashtag list into a single text
bbc_news_texts = {
    item_id: preprocess_hashtags(tags)
    for item_id, tags in bbc_news_hashtags.items()
}
guardian_news_texts = {
    item_id: preprocess_hashtags(tags)
    for item_id, tags in guardian_news_hashtags.items()
}

# Encode every text with the sentence-transformer model
bbc_embeddings = {
    item_id: model.encode(joined_tags)
    for item_id, joined_tags in bbc_news_texts.items()
}
guardian_embeddings = {
    item_id: model.encode(joined_tags)
    for item_id, joined_tags in guardian_news_texts.items()
}

# Score every (BBC item, Guardian item) pair
similarity_scores = {}
for bbc_key, bbc_embed in bbc_embeddings.items():
    for guardian_key, guardian_embed in guardian_embeddings.items():
        # Each embedding is wrapped in a list so cosine_similarity receives
        # one-row inputs; the lone entry of the result is this pair's score.
        score = cosine_similarity([bbc_embed], [guardian_embed])[0][0]
        similarity_scores[bbc_key, guardian_key] = score

# Report one line per pair, rounded to two decimals
for (bbc_key, guardian_key), score in similarity_scores.items():
    print(f"Similarity between BBC {bbc_key} and Guardian {guardian_key}: {score:.2f}")




Similarity between BBC news1 and Guardian news1: 0.40
Similarity between BBC news2 and Guardian news1: 0.25
Similarity between BBC news3 and Guardian news1: 0.57
