In [1]:
from transformers import pipeline
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
import yaml
import json
import random
import re
import ast

TEST_SIZE = 10

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def avg(inp):
    return sum(inp) / len(inp)

    
def get_hypothesis_pred(output):
    match = re.search(r"Final answer: the (first|second) tweet", output)
    if match:
        result = match.group(1)
        return result
    else:
        return None

def get_hypothesis(output):
    match = re.search(r'HYP:\s*(.*?[\.\!\?])', output)
    if match:
        result = match.group(1)
        return result
    else:
        return None

def compute_score(preds, labels):
    correct = sum([t == p for t, p in zip(labels, preds)])
    score = correct / len(labels)
    return score

# Randomly sample k examples from the dataset for few-shot generation, used for prompting
def get_random_batch(dataset, k=10):
    sample = random.sample(list(dataset), k)
    return [(s['first_tweet'], s['second_tweet'], s['result']) for s in sample]


## 1 load test model

In [3]:
open_r1_path = "../open-r1-hypogen/"
policy_model = pipeline("text-generation", model=open_r1_path + "data/Qwen2.5-1.5B-hypogen-GRPO-wexamples")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Device set to use cuda:0


## 2 generate 10 hypothesis 

In [4]:
with open(open_r1_path+"recipes/hypoGen/config_demo.yaml", "r", encoding="utf-8") as file:
    data = yaml.safe_load(file)
with open(open_r1_path+'/recipes/hypoGen/hypothesis_bank.json', 'r') as f:
    hb = json.load(f)
hypothesis_bank = hb['hypothesis']
data['generation_mode'] = "tweet_based"

In [5]:

dataset = load_dataset("Dudep/retweet_all")
llm_inputs = []

for i in range(TEST_SIZE):
    if data['generation_mode'] == "zero_shot" and data['user_prompt_zero_shot'] is not None:
        llm_inputs.append(data['user_prompt_zero_shot'])


    elif data['generation_mode'] == "few_shot" and data['user_prompt_few_shot'] is not None:
        # Get a random batch of 4 example hypotheses from the hypothesis bank
        few_shot = random.sample(hypothesis_bank, 4)
        # Remove the index of the hypotheses
        few_shot = [h.split(". ", 1)[1] for h in few_shot]
        # Format them into a single string
        few_shot_text = "\n".join([f"- HYP: {h}" for h in few_shot])
        # Add the examples to the llm_inputs
        user_prompt = data['user_prompt_few_shot'].format(few_shot_examples=few_shot_text)
        llm_inputs.append(user_prompt)


    elif data['generation_mode'] == "tweet_based" and data['user_prompt_tweet_based'] is not None:
        # Get a random batch of 10 tweet pairs from the dataset
        tweet_examples = get_random_batch(dataset['train'], k=10)
        # Format them into a single string
        tweet_examples_text = "\n".join([f"(- tweet 1: {t[0]}\n- tweet 2: {t[1]}\n- label: {t[2]})" for t in tweet_examples])
        
        # Get a random batch of 4 example hypotheses from the hypothesis bank
        few_shot = random.sample(hypothesis_bank, 4)
        # Remove the index of the hypotheses
        few_shot = [h.split(". ", 1)[1] for h in few_shot]
        # Format them into a single string
        few_shot_text = "\n".join([f"- HYP: {h}" for h in few_shot])

        # Add the examples to the llm_inputs
        user_prompt = data['user_prompt_tweet_based'].format(
            tweet_examples=tweet_examples_text
        )
        llm_inputs.append(user_prompt)

In [6]:
hyp_result = []
for inp in llm_inputs:
    hyp = None
    while hyp is None:
        output = policy_model(inp, 
                            max_new_tokens=1024, 
                            num_return_sequences=1,
                            temperature=0.7, 
                            do_sample=True,
                            return_full_text=False,
                            )
        #print(output[0]['generated_text'])
        hyp = get_hypothesis(output[0]['generated_text'].split('## Output')[-1])
    print(hyp)
    hyp_result.append(hyp)

Retweets occur more often if there is an agreement between the tweeters about the relevance of their content.
The labeled tweet is more likely to be retweeted due to its relevance to current events and issues, whereas the unlabeled tweet focuses on fictional content within a specific context.
Retweets are influenced by relevance and importance of content; labeled tweets provide additional information or context that may encourage further engagement and sharing.
Retweets are influenced by content quality.
Retweets are influenced by the content quality of the tweets.
Hypothetically speaking, the labelled tweet has a higher chance of being retweeted because it is tagged with an explanation or context about its content, making it easier for followers to understand and appreciate the message.
The labelled tweet is retweeted more likely because it provides additional context or information that makes it more interesting or relevant to the audience.
Retweets are influenced by relevance and im

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Retweets are influenced by the presence of visuals or external links within a tweet.
Retweets are influenced by the quality and relevance of the content, regardless of the label.


## 3 evaluate these hypothesis


#### 3.1 Practiality Score Comparison

In [7]:
infer_model = pipeline("text-generation", model="Qwen/Qwen2.5-3B-Instruct")
with open('../data/retweet/retweet_val.json', 'r') as f:
    tweet_pairs_dataset = json.load(f)
tweet_pairs_dataset['input'] = [(first, second) for first, second in zip(tweet_pairs_dataset['first_tweet'], tweet_pairs_dataset['second_tweet'])]

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.64it/s]
Device set to use cuda:0


In [8]:
def get_practical_rewards(hyp_result, batch_size=10):
    with open(open_r1_path+'/recipes/hypoGen/hypothesis_infer.md', 'r') as f:
        infer_template = f.read()

    practical_rewards = []
    for hyp in hyp_result:
        rewards = []
        print(f'testing :{hyp}\n')
        for i in range(0, len(tweet_pairs_dataset['input']), batch_size):
            llm_input = tweet_pairs_dataset['input'][i:i+batch_size]
            labels = tweet_pairs_dataset['label'][i:i+batch_size]
            if not hyp:
                rewards.append(0)

            else:
                final_prompt = infer_template.format(input=llm_input, hypothesis=hyp)
                output = infer_model(final_prompt, 
                                    max_new_tokens=100, 
                                    num_return_sequences=1,
                                    temperature=0.3, 
                                    do_sample=True)
                output = output[0]['generated_text'].split('## OUTPUT')[-1]
                
                match = re.search(r'\[[^\[\]]*\]', output)
                if match:
                    preds = match.group(0)
                    preds = ast.literal_eval(preds)
                    score = compute_score(preds, labels)
                    rewards.append(score)
                else:
                    rewards.append(0)
        avg_rew = sum(rewards)/len(rewards)
        print(f'avg practical rewards: {avg_rew}')
        practical_rewards.append(avg_rew)
    return practical_rewards

In [9]:
practical_rewards = get_practical_rewards(hyp_result, 10)

testing :Retweets occur more often if there is an agreement between the tweeters about the relevance of their content.

avg practical rewards: 0.4499999999999999
testing :The labeled tweet is more likely to be retweeted due to its relevance to current events and issues, whereas the unlabeled tweet focuses on fictional content within a specific context.

avg practical rewards: 0.41500000000000004
testing :Retweets are influenced by relevance and importance of content; labeled tweets provide additional information or context that may encourage further engagement and sharing.

avg practical rewards: 0.45499999999999996
testing :Retweets are influenced by content quality.

avg practical rewards: 0.45
testing :Retweets are influenced by the content quality of the tweets.

avg practical rewards: 0.4950000000000001
testing :Hypothetically speaking, the labelled tweet has a higher chance of being retweeted because it is tagged with an explanation or context about its content, making it easier 

# zero-shot

testing :HYP: Users who describe their days as 'tough' are more likely to post positive tweets than those who describe them as 'long'.

avg practical rewards: 0.43
testing :Users often use slightly different wording to emphasize or change the tone of similar information, such as highlighting time sensitivity or emotional impact without altering the fundamental meaning.

avg practical rewards: 0.45999999999999996
testing :The tweets have a similar sentiment, but the wording differs slightly.

avg practical rewards: 0.4699999999999999
testing :HYP: The number of words in each tweet pair is the same, but there are more negative words in the first tweet compared to the second one.

avg practical rewards: 0.42000000000000004
testing :HYP: The difference in wording often indicates an increase or decrease in sentiment, but the specific magnitude of change varies based on context.

avg practical rewards: 0.4000000000000001
testing :HYP: The more positive words used, the higher the likelihood of a negative tweet being followed by another negative tweet.

avg practical rewards: 0.45999999999999996
testing :HYP: The sentiment expressed in the tweet is generally positive but varies based on the inclusion or omission of certain positive words or phrases.

avg practical rewards: 0.395
testing :HYP: The longer a tweet is, the more likely it will be retweeted.

avg practical rewards: 0.41
testing :HYP: HYP: The wording difference reflects varying degrees of emotional intensity used to convey excitement.

avg practical rewards: 0.45500000000000007
testing :HYP: If a tweet includes a specific hashtag or mentions a particular event, then the sentiment of the tweet tends to be more positive than those without this information.

avg practical rewards: 0.41

# few-shot

testing :Tweets containing hashtags are likely to receive more retweets because they help establish credibility through social proof and attract attention due to their visibility on popular platforms like Twitter.

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
avg practical rewards: 0.41000000000000003
testing :** Tweets with emotive language, such as personal experiences or emotions, tend to elicit stronger reactions due to their relatability and authenticity.

avg practical rewards: 0.445
testing :Tweets featuring images or GIFs tend to receive more favorites due to their ability to visually convey information effectively and engage users who may not be immediately interested in reading extensive textual descriptions.

avg practical rewards: 0.31999999999999995
testing :Tweets that include a clear call to action, such as "Join us in #SaveOurPlanet!

avg practical rewards: 0.365
testing :Emotional language tends to increase engagement in social media posts because it taps into universal emotional responses, potentially drawing attention and fostering a sense of community among users who identify with shared emotions.

avg practical rewards: 0.43499999999999994
testing :** Tweets expressing strong emotions, especially those involving urgency or personal relevance, are more likely to be liked due to their ability to engage readers on a deeper level, potentially influencing them to share or engage further.

avg practical rewards: 0.46499999999999997
testing :Tweets containing emotional language or tone, especially when expressing positivity or empathy, are likely to receive higher engagement because they resonate emotionally with readers.

avg practical rewards: 0.41500000000000004
testing :Tweets with personal stories tend to elicit greater emotional resonance and thus lead to higher levels of engagement, including likes, retweets, and shares, due to the relatability and authenticity of the narrative presented in the content.

avg practical rewards: 0.485
testing :Tweets containing emotional language tend to receive higher numbers of likes due to their ability to connect with readers emotionally.

avg practical rewards: 0.43499999999999994
testing :Hashtags significantly impact the likelihood of a tweet being retweeted, suggesting they serve as identifiers for relevant topics or trends within a given context.

avg practical rewards: 0.425

# with examples

testing :Retweets occur more often if there is an agreement between the tweeters about the relevance of their content.

avg practical rewards: 0.4499999999999999
testing :The labeled tweet is more likely to be retweeted due to its relevance to current events and issues, whereas the unlabeled tweet focuses on fictional content within a specific context.

avg practical rewards: 0.41500000000000004
testing :Retweets are influenced by relevance and importance of content; labeled tweets provide additional information or context that may encourage further engagement and sharing.

avg practical rewards: 0.45499999999999996
testing :Retweets are influenced by content quality.

avg practical rewards: 0.45
testing :Retweets are influenced by the content quality of the tweets.

avg practical rewards: 0.4950000000000001
testing :Hypothetically speaking, the labelled tweet has a higher chance of being retweeted because it is tagged with an explanation or context about its content, making it easier for followers to understand and appreciate the message.

avg practical rewards: 0.445
testing :The labelled tweet is retweeted more likely because it provides additional context or information that makes it more interesting or relevant to the audience.

avg practical rewards: 0.475
testing :Retweets are influenced by relevance and importance of the content, regardless of whether the content is labeled or not.

avg practical rewards: 0.45999999999999996
testing :Retweets are influenced by the presence of visuals or external links within a tweet.

avg practical rewards: 0.32499999999999996
testing :Retweets are influenced by the quality and relevance of the content, regardless of the label.

avg practical rewards: 0.485

#### 3.2 Novelty Score Comparison

In [10]:
from sentence_transformers import SentenceTransformer, util
emb_model = SentenceTransformer('all-MiniLM-L6-v2')
emb_hyp_bank = emb_model.encode(hypothesis_bank, convert_to_tensor=True)

def get_novelty_rewards(hyp_result):
    rewards = []
    for hyp in hyp_result:
        if not hyp:
            rewards.append(0)
        else:
            emb_completion = emb_model.encode(hyp, convert_to_tensor=True)
            cosine_rewards = util.cos_sim(emb_completion, emb_hyp_bank)
            mean_similarity = cosine_rewards.mean().item()
            novelty = 1 - mean_similarity
            rewards.append(novelty)

    return rewards

In [11]:
novelty_rewards = get_novelty_rewards(hyp_result)
for i in range(len(novelty_rewards)):
    novelty_rewards[i] = novelty_rewards[i]
    print(f'Hypothesis: {hyp_result[i]}, Novelty: {novelty_rewards[i]}\n')
print(f'Mean Novelty: {avg(novelty_rewards)}\n')

Hypothesis: Retweets occur more often if there is an agreement between the tweeters about the relevance of their content., Novelty: 0.5841473639011383

Hypothesis: The labeled tweet is more likely to be retweeted due to its relevance to current events and issues, whereas the unlabeled tweet focuses on fictional content within a specific context., Novelty: 0.635564923286438

Hypothesis: Retweets are influenced by relevance and importance of content; labeled tweets provide additional information or context that may encourage further engagement and sharing., Novelty: 0.5922483503818512

Hypothesis: Retweets are influenced by content quality., Novelty: 0.5720847547054291

Hypothesis: Retweets are influenced by the content quality of the tweets., Novelty: 0.5571405291557312

Hypothesis: Hypothetically speaking, the labelled tweet has a higher chance of being retweeted because it is tagged with an explanation or context about its content, making it easier for followers to understand and appr

# zero-shot

Hypothesis: HYP: Users who describe their days as 'tough' are more likely to post positive tweets than those who describe them as 'long'., Novelty: 0.6143368780612946

Hypothesis: Users often use slightly different wording to emphasize or change the tone of similar information, such as highlighting time sensitivity or emotional impact without altering the fundamental meaning., Novelty: 0.8130578994750977

Hypothesis: The tweets have a similar sentiment, but the wording differs slightly., Novelty: 0.5290723741054535

Hypothesis: HYP: The number of words in each tweet pair is the same, but there are more negative words in the first tweet compared to the second one., Novelty: 0.5707627236843109

Hypothesis: HYP: The difference in wording often indicates an increase or decrease in sentiment, but the specific magnitude of change varies based on context., Novelty: 0.7462144196033478

Hypothesis: HYP: The more positive words used, the higher the likelihood of a negative tweet being followed by another negative tweet., Novelty: 0.5801884531974792

Hypothesis: HYP: The sentiment expressed in the tweet is generally positive but varies based on the inclusion or omission of certain positive words or phrases., Novelty: 0.6465462148189545

Hypothesis: HYP: The longer a tweet is, the more likely it will be retweeted., Novelty: 0.5442407131195068

Hypothesis: HYP: HYP: The wording difference reflects varying degrees of emotional intensity used to convey excitement., Novelty: 0.836878091096878

Hypothesis: HYP: If a tweet includes a specific hashtag or mentions a particular event, then the sentiment of the tweet tends to be more positive than those without this information., Novelty: 0.5831388533115387

Mean Novelty: 0.6464436620473861

# few-shot

Hypothesis: Tweets containing hashtags are likely to receive more retweets because they help establish credibility through social proof and attract attention due to their visibility on popular platforms like Twitter., Novelty: 0.52156201004982

Hypothesis: ** Tweets with emotive language, such as personal experiences or emotions, tend to elicit stronger reactions due to their relatability and authenticity., Novelty: 0.517014354467392

Hypothesis: Tweets featuring images or GIFs tend to receive more favorites due to their ability to visually convey information effectively and engage users who may not be immediately interested in reading extensive textual descriptions., Novelty: 0.4922412633895874

Hypothesis: Tweets that include a clear call to action, such as "Join us in #SaveOurPlanet!, Novelty: 0.6062251925468445

Hypothesis: Emotional language tends to increase engagement in social media posts because it taps into universal emotional responses, potentially drawing attention and fostering a sense of community among users who identify with shared emotions., Novelty: 0.6597398221492767

Hypothesis: ** Tweets expressing strong emotions, especially those involving urgency or personal relevance, are more likely to be liked due to their ability to engage readers on a deeper level, potentially influencing them to share or engage further., Novelty: 0.44226980209350586

Hypothesis: Tweets containing emotional language or tone, especially when expressing positivity or empathy, are likely to receive higher engagement because they resonate emotionally with readers., Novelty: 0.5605274140834808

Hypothesis: Tweets with personal stories tend to elicit greater emotional resonance and thus lead to higher levels of engagement, including likes, retweets, and shares, due to the relatability and authenticity of the narrative presented in the content., Novelty: 0.5020719468593597

Hypothesis: Tweets containing emotional language tend to receive higher numbers of likes due to their ability to connect with readers emotionally., Novelty: 0.44571882486343384

Hypothesis: Hashtags significantly impact the likelihood of a tweet being retweeted, suggesting they serve as identifiers for relevant topics or trends within a given context., Novelty: 0.5610295832157135

Mean Novelty: 0.5308400213718414

# with examples

Hypothesis: Retweets occur more often if there is an agreement between the tweeters about the relevance of their content., Novelty: 0.5841473639011383

Hypothesis: The labeled tweet is more likely to be retweeted due to its relevance to current events and issues, whereas the unlabeled tweet focuses on fictional content within a specific context., Novelty: 0.635564923286438

Hypothesis: Retweets are influenced by relevance and importance of content; labeled tweets provide additional information or context that may encourage further engagement and sharing., Novelty: 0.5922483503818512

Hypothesis: Retweets are influenced by content quality., Novelty: 0.5720847547054291

Hypothesis: Retweets are influenced by the content quality of the tweets., Novelty: 0.5571405291557312

Hypothesis: Hypothetically speaking, the labelled tweet has a higher chance of being retweeted because it is tagged with an explanation or context about its content, making it easier for followers to understand and appreciate the message., Novelty: 0.6296382546424866

Hypothesis: The labelled tweet is retweeted more likely because it provides additional context or information that makes it more interesting or relevant to the audience., Novelty: 0.6048772633075714

Hypothesis: Retweets are influenced by relevance and importance of the content, regardless of whether the content is labeled or not., Novelty: 0.6947284638881683

Hypothesis: Retweets are influenced by the presence of visuals or external links within a tweet., Novelty: 0.6073979735374451

Hypothesis: Retweets are influenced by the quality and relevance of the content, regardless of the label., Novelty: 0.6854727268218994

Mean Novelty: 0.6163300603628159


#### 3.3 Soundness Score Comparison

In [12]:
def soundness_rewards(hyp_result):
    with open(open_r1_path+'/recipes/hypoGen/judge_model.md', 'r') as f:
        infer_template = f.read()
    rewards = []
    for hyp in hyp_result:
        if not hyp:
            rewards.append(0)
        else:
            max_retries = 3
            retry_count = 0
            output = infer_model(infer_template.format(hypothesis = hyp), #prompt, 
                                max_new_tokens=100, 
                                num_return_sequences=1,
                                temperature=0.3, 
                                do_sample=True)
            output = output[0]['generated_text'].split('## OUTPUT')[-1]
    
            # Extract scores with error handling
            #match = re.search(r'\[[^\[\]]*\]', output)
            match = re.search(r'Score:\s*([0-9]+(?:\.[0-9]+)?)', output)
            if not match:
                while retry_count < max_retries:
                    retry_count += 1
                    output = infer_model(infer_template.format(hypothesis = hyp), #prompt,
                                        max_new_tokens=100, 
                                        num_return_sequences=1,
                                        temperature=0.3, 
                                        do_sample=True)
                    output = output[0]['generated_text'].split('## OUTPUT')[-1]
                    match = re.search(r'\[[^\[\]]*\]', output)
                    if match:
                        break
            
            if not match:
                rewards.append(0)
            else: 
                rewards.append(0.1*float(match.group(1))) # to [0, 1]
            
    rewards = [max(0.0, min(1.0, score)) for score in rewards]
            
    return rewards

In [13]:
soundness_r = soundness_rewards(hyp_result)
for i in range(len(soundness_r)):
    soundness_r[i] = soundness_r[i]
    print(f'Hypothesis: {hyp_result[i]}, Soundness: {soundness_r[i]}\n')
print(f'Mean Soundness: {avg(soundness_r)} \n')

Hypothesis: Retweets occur more often if there is an agreement between the tweeters about the relevance of their content., Soundness: 0.8

Hypothesis: The labeled tweet is more likely to be retweeted due to its relevance to current events and issues, whereas the unlabeled tweet focuses on fictional content within a specific context., Soundness: 0.8

Hypothesis: Retweets are influenced by relevance and importance of content; labeled tweets provide additional information or context that may encourage further engagement and sharing., Soundness: 0.9

Hypothesis: Retweets are influenced by content quality., Soundness: 0.8

Hypothesis: Retweets are influenced by the content quality of the tweets., Soundness: 0.8

Hypothesis: Hypothetically speaking, the labelled tweet has a higher chance of being retweeted because it is tagged with an explanation or context about its content, making it easier for followers to understand and appreciate the message., Soundness: 0.8

Hypothesis: The labelled tw

# zero-shot

Hypothesis: HYP: Users who describe their days as 'tough' are more likely to post positive tweets than those who describe them as 'long'., Soundness: 0.4

Hypothesis: Users often use slightly different wording to emphasize or change the tone of similar information, such as highlighting time sensitivity or emotional impact without altering the fundamental meaning., Soundness: 0.9

Hypothesis: The tweets have a similar sentiment, but the wording differs slightly., Soundness: 0.8

Hypothesis: HYP: The number of words in each tweet pair is the same, but there are more negative words in the first tweet compared to the second one., Soundness: 0.6000000000000001

Hypothesis: HYP: The difference in wording often indicates an increase or decrease in sentiment, but the specific magnitude of change varies based on context., Soundness: 0.8

Hypothesis: HYP: The more positive words used, the higher the likelihood of a negative tweet being followed by another negative tweet., Soundness: 0.30000000000000004

Hypothesis: HYP: The sentiment expressed in the tweet is generally positive but varies based on the inclusion or omission of certain positive words or phrases., Soundness: 0.8

Hypothesis: HYP: The longer a tweet is, the more likely it will be retweeted., Soundness: 0.5

Hypothesis: HYP: HYP: The wording difference reflects varying degrees of emotional intensity used to convey excitement., Soundness: 0.8

Hypothesis: HYP: If a tweet includes a specific hashtag or mentions a particular event, then the sentiment of the tweet tends to be more positive than those without this information., Soundness: 0.8

Mean Soundness: 0.6699999999999999

# few-shot

Hypothesis: Tweets containing hashtags are likely to receive more retweets because they help establish credibility through social proof and attract attention due to their visibility on popular platforms like Twitter., Soundness: 0.9

Hypothesis: ** Tweets with emotive language, such as personal experiences or emotions, tend to elicit stronger reactions due to their relatability and authenticity., Soundness: 0.9

Hypothesis: Tweets featuring images or GIFs tend to receive more favorites due to their ability to visually convey information effectively and engage users who may not be immediately interested in reading extensive textual descriptions., Soundness: 0.9

Hypothesis: Tweets that include a clear call to action, such as "Join us in #SaveOurPlanet!, Soundness: 0.9

Hypothesis: Emotional language tends to increase engagement in social media posts because it taps into universal emotional responses, potentially drawing attention and fostering a sense of community among users who identify with shared emotions., Soundness: 0.9

Hypothesis: ** Tweets expressing strong emotions, especially those involving urgency or personal relevance, are more likely to be liked due to their ability to engage readers on a deeper level, potentially influencing them to share or engage further., Soundness: 0.9

Hypothesis: Tweets containing emotional language or tone, especially when expressing positivity or empathy, are likely to receive higher engagement because they resonate emotionally with readers., Soundness: 0.9

Hypothesis: Tweets with personal stories tend to elicit greater emotional resonance and thus lead to higher levels of engagement, including likes, retweets, and shares, due to the relatability and authenticity of the narrative presented in the content., Soundness: 0.9

Hypothesis: Tweets containing emotional language tend to receive higher numbers of likes due to their ability to connect with readers emotionally., Soundness: 0.8

Hypothesis: Hashtags significantly impact the likelihood of a tweet being retweeted, suggesting they serve as identifiers for relevant topics or trends within a given context., Soundness: 0.9

Mean Soundness: 0.8900000000000002

# with examples

Hypothesis: Retweets occur more often if there is an agreement between the tweeters about the relevance of their content., Soundness: 0.8

Hypothesis: The labeled tweet is more likely to be retweeted due to its relevance to current events and issues, whereas the unlabeled tweet focuses on fictional content within a specific context., Soundness: 0.8

Hypothesis: Retweets are influenced by relevance and importance of content; labeled tweets provide additional information or context that may encourage further engagement and sharing., Soundness: 0.9

Hypothesis: Retweets are influenced by content quality., Soundness: 0.8

Hypothesis: Retweets are influenced by the content quality of the tweets., Soundness: 0.8

Hypothesis: Hypothetically speaking, the labelled tweet has a higher chance of being retweeted because it is tagged with an explanation or context about its content, making it easier for followers to understand and appreciate the message., Soundness: 0.8

Hypothesis: The labelled tweet is retweeted more likely because it provides additional context or information that makes it more interesting or relevant to the audience., Soundness: 0.9

Hypothesis: Retweets are influenced by relevance and importance of the content, regardless of whether the content is labeled or not., Soundness: 0.9

Hypothesis: Retweets are influenced by the presence of visuals or external links within a tweet., Soundness: 0.8

Hypothesis: Retweets are influenced by the quality and relevance of the content, regardless of the label., Soundness: 1.0

Mean Soundness: 0.85

