# Test Hastags

In [10]:
bbc_news_hashtags = {
    "news1": [
        "#DonaldTrump",
        "#DanaWhite",
        "#UFC",
        "#RNC",
        "#MMA",
        "#Republican",
        "#JohnMcCain",
        "#TrumpTajMahal",
        "#AtlanticCity",
        "#NewJersey",
        "#USSenator"
    ],
    "news2": [
        "#Fangirls",
        "#HarryStyles",
        "#Swifties",
        "#TaylorSwift",
        "#ErasTour",
        "#YveBlake",
        "#OneDirection",
        "#ZaynMalik",
        "#Musical",
        "#SydneyOperaHouse",
        "#LyricTheatre",
        "#Hammersmith",
        "#JasmineElcock",
        "#BritainsGotTalent"
    ],
    "news3" : [
    "#GreyhoundRacing",
    "#AnimalCruelty",
    "#Whistleblower",
    "#Australia",
    "#GRNSW",
    "#NSW",
    "#AlexBrittan",
    "#AnimalWelfare",
    "#DogRacing",
    "#Injustice",
    "#Inquiry",
    "#DavidHarris",
    "#ChrisMinns",
    "#EmmaHurst",
    "#Sydney"
],
}
guardian_news_hashtags = {
    "news1": [
    "#GreyhoundRacing",
    "#AnimalWelfare",
    "#AnimalAbuse",
    "#VeterinaryReport",
    "#NSWInquiry",
    "#PublicHearings",
    "#WhistleblowerProtection",
    "#ChrisMinns",
    "#RacingIndustry",
    "#GamblingIndustry"
]
}

cnn_news_hashtags = {
    "news1": [
        "#SpaceExploration",
        "#MarsMission",
        "#NASA",
        "#Astronauts",
        "#SpaceStation",
        "#InterstellarTravel",
        "#Galaxies",
        "#Astrobiology",
        "#SpaceTechnology",
        "#RocketScience",
        "#CosmicDiscovery"
    ],
    "news2": [
        "#ClimateChange",
        "#SustainableLiving",
        "#GreenEnergy",
        "#RenewableResources",
        "#EcoFriendly",
        "#CarbonFootprint",
        "#ClimateAction",
        "#GlobalWarming",
        "#EnvironmentalPolicy",
        "#CleanEnergy",
        "#EarthDay"
    ],
    "news3": [
        "#DigitalNomads",
        "#RemoteWork",
        "#FreelanceLife",
        "#WorkFromAnywhere",
        "#OnlineBusiness",
        "#VirtualTeams",
        "#TravelAndWork",
        "#GigEconomy",
        "#DigitalMarketing",
        "#HomeOffice",
        "#WorkLifeBalance"
    ]
}


## Imports

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

## Define Model

In [7]:
model = SentenceTransformer('all-MiniLM-L6-v2')




## PreProcess

In [11]:

# Preprocess hashtags into strings
def preprocess_hashtags(hashtags):
    return ' '.join([hashtag.lower() for hashtag in hashtags])

# Generate embeddings for each set of hashtags
bbc_news_texts = {key: preprocess_hashtags(value) for key, value in bbc_news_hashtags.items()}
guardian_news_texts = {key: preprocess_hashtags(value) for key, value in guardian_news_hashtags.items()}
cnn_news_texts = {key: preprocess_hashtags(value) for key, value in cnn_news_hashtags.items()}

bbc_embeddings = {key: model.encode(text) for key, text in bbc_news_texts.items()}
guardian_embeddings = {key: model.encode(text) for key, text in guardian_news_texts.items()}
cnn_embeddings = {key: model.encode(text) for key, text in cnn_news_texts.items()}



## Calculate similarity

In [12]:

# Calculate similarity scores between BBC, Guardian, and CNN news
similarity_scores = {}
for bbc_key, bbc_embed in bbc_embeddings.items():
    for guardian_key, guardian_embed in guardian_embeddings.items():
        score = cosine_similarity([bbc_embed], [guardian_embed])[0][0]
        similarity_scores[(bbc_key, guardian_key, 'Guardian')] = score
    
    for cnn_key, cnn_embed in cnn_embeddings.items():
        score = cosine_similarity([bbc_embed], [cnn_embed])[0][0]
        similarity_scores[(bbc_key, cnn_key, 'CNN')] = score

for guardian_key, guardian_embed in guardian_embeddings.items():
    for cnn_key, cnn_embed in cnn_embeddings.items():
        score = cosine_similarity([guardian_embed], [cnn_embed])[0][0]
        similarity_scores[(guardian_key, cnn_key, 'CNN')] = score

# Print the similarity scores
for (news1_key, news2_key, source), score in similarity_scores.items():
    print(f"Similarity between {source} {news1_key} and {source} {news2_key}: {score:.2f}")


Similarity between Guardian news1 and Guardian news1: 0.35
Similarity between CNN news1 and CNN news1: 0.36
Similarity between CNN news1 and CNN news2: 0.30
Similarity between CNN news1 and CNN news3: 0.30
Similarity between Guardian news2 and Guardian news1: 0.36
Similarity between CNN news2 and CNN news1: 1.00
Similarity between CNN news2 and CNN news2: 0.26
Similarity between CNN news2 and CNN news3: 0.23
Similarity between Guardian news3 and Guardian news1: 0.75
Similarity between CNN news3 and CNN news1: 0.41
Similarity between CNN news3 and CNN news2: 0.36
Similarity between CNN news3 and CNN news3: 0.30
