# This is a code snippet for hypothesis generation benchmark.

 - Base model
 - Hypothesis Generation inference Pipe

## 1 Hypothesis Generation

In [1]:
from transformers import pipeline

# Load the language model pipeline
llm = pipeline("text-generation", model="Qwen/Qwen2.5-3B-Instruct")

  from .autonotebook import tqdm as notebook_tqdm
Downloading shards: 100%|██████████| 2/2 [01:07<00:00, 33.84s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:18<00:00,  9.29s/it]
Device set to use cuda:0


In [21]:
gen_prompt = '''You are a social media expert. 
You are an expert at determining which tweet will be retweeted more. 
Given a set of observations, you want to generation hypotheses that will help predict which tweet out of a pair of tweets is more likely to be retweeted. 
Please note that the paired tweets are about the same content and are posted by the same user, so you should focus on the wording difference between the two tweets in each pair. 
Please propose {num_hypotheses} possible hypotheses. Please generate them in the format of 1. [hypothesis], 2. [hypothesis], ... {num_hypotheses}. [hypothesis]. 
Please make the hypotheses general enough to be applicable to new observations.
We made some observations: 
{examples}
Proposed hypotheses:'''

In [6]:
examples = "1. Tweet A: 'Check out our new product launch!' Tweet B: 'Don't miss our new product launch!'\n2. Tweet A: 'Join us for a webinar on AI.' Tweet B: 'Sign up for our AI webinar now!'"
num_hypotheses = 5
formatted_prompt = gen_prompt.format(examples=examples, num_hypotheses=num_hypotheses)

# Generate hypotheses
hypotheses = llm(formatted_prompt, max_new_tokens=200, num_return_sequences=1)
print(hypotheses[0]['generated_text'])

[{'generated_text': "You are a social media expert. \nYou are an expert at determining which tweet will be retweeted more. \nGiven a set of observations, you want to generation hypotheses that will help predict which tweet out of a pair of tweets is more likely to be retweeted. \nPlease note that the paired tweets are about the same content and are posted by the same user, so you should focus on the wording difference between the two tweets in each pair. \nPlease propose <num_hypotheses> possible hypotheses. Please generate them in the format of 1. [hypothesis], 2. [hypothesis], ... <num_hypotheses>. [hypothesis]. \nPlease make the hypotheses general enough to be applicable to new observations.\nWe made some observations: \n1. Tweet A: 'Check out our new product launch!' Tweet B: 'Don't miss our new product launch!'\n2. Tweet A: 'Join us for a webinar on AI.' Tweet B: 'Sign up for our AI webinar now!'\nProposed hypotheses: \n1. [Tweet with action verb] is more likely to be retweeted th

## 2 Hypothesis inference

In [37]:
infer_prompt = '''
**INSTRUCT**
You are a social media expert. 
Given a pair of tweets, you are asked to predict which tweet will be retweeted more. 
Please note that the paired tweets are about the same content and are posted by the same user, so you should focus on the wording difference between the two tweets. 
From past experiences, you learned a pattern. Now, at each time, you should apply a learned pattern to a pair of tweets and determine which one will get more retweets. 
Given the pattern you learned above, predict which one of the two tweets will get more retweets. 
Think step by step. 
First step: Think about if the pattern can be applied to the tweets. 
Second step: Analyze the textual difference between the two tweets. 
Third step: Based on the pattern, which tweet is more likely to get more retweets? 
Final step: Give your final answer in the format of Final answer: the _ tweet where _ is either first or second. 

**INPUT**
Our learned pattern: {hypothesis_high_reward}
The first tweet: {first_tweet} 
The second tweet: {second_tweet} 

**OUTPUT**
Final answer:'''


In [38]:
learn_hyp = '[Tweet with strong emotion] is more likely to be retweeted than [Tweet without strong emotion]'
first_tweet = '''Derek Carr: "I'm the biggest Fresno St fan there is … [BCS] would be the coolest thing ever." My story from Fresno: http://t.co/3ZeLnut0qq'''
second_tweet = 'PM RT: My column from Fresno State, the aspiring Cinderella that no one outside of Fresno seems to embrace. http://t.co/3ZeLnut0qq'
formatted_prompt = infer_prompt.format(hypothesis_high_reward=learn_hyp, first_tweet=first_tweet, second_tweet=second_tweet)

# Generate hypotheses
hypotheses = llm(formatted_prompt, 
    max_new_tokens=200, 
    num_return_sequences=1,
    temperature=0.9, 
    do_sample=True)
print(hypotheses[0]['generated_text'])

[{'generated_text': '\n**INSTRUCT**\nYou are a social media expert. \nGiven a pair of tweets, you are asked to predict which tweet will be retweeted more. \nPlease note that the paired tweets are about the same content and are posted by the same user, so you should focus on the wording difference between the two tweets. \nFrom past experiences, you learned a pattern. Now, at each time, you should apply a learned pattern to a pair of tweets and determine which one will get more retweets. \nGiven the pattern you learned above, predict which one of the two tweets will get more retweets. \nThink step by step. \nFirst step: Think about if the pattern can be applied to the tweets. \nSecond step: Analyze the textual difference between the two tweets. \nThird step: Based on the pattern, which tweet is more likely to get more retweets? \nFinal step: Give your final answer in the format of Final answer: the _ tweet where _ is either first or second. \n\n**INPUT**\nOur learned pattern: [Tweet wit

## 3 2 in 1

In [40]:
prompt = '''## INSTRUCTION
You are a social media expert. Your task has two parts:

### Part 1: Hypothesis Generation
Given a set of tweet observations, you will generate hypotheses that are useful for predicting which tweet out of a pair will be retweeted more.
- Each tweet pair is posted by the same user and contains similar content with slight wording differences.
- Focus on these wording differences.
- Please generate 1 hypotheses in the format:
  HP: [hypothesis]
- Make your hypotheses general enough to apply to new tweet pairs.

### Part 2: Hypothesis-Based Inference
Using the hypotheses you just generated, apply them to a given pair of tweets.
- Predict which tweet will be retweeted more based on the learned patterns.
- Answer in the format:
  **Final answer: the _ tweet** (where `_` is either `first` or `second`)

Think step by step:
1. Can your hypothesis apply to the tweets?
2. Analyze the textual differences.
3. Decide which tweet is more likely to be retweeted.
4. Provide your final prediction.


## INPUT
- First tweet: {first_tweet}
- Second tweet: {second_tweet}

## OUTPUT
'''


In [49]:
first_tweet = '''How Great is Our God [essential collection] released today...just wish i had room to add a few more:) http://t.co/JXGbJucz'''
second_tweet = '''the (world edition) of How Great is Our God is possibly the best thing i'll be a part of musically...ever http://t.co/JXGbJucz'''


formatted_prompt = prompt.format(first_tweet=first_tweet, second_tweet=second_tweet)

# Generate hypotheses
hypotheses = llm(formatted_prompt, 
    max_new_tokens=200, 
    num_return_sequences=1,
    temperature=0.9, 
    do_sample=True)
print(hypotheses[0]['generated_text'])

## INSTRUCTION
You are a social media expert. Your task has two parts:

### Part 1: Hypothesis Generation
Given a set of tweet observations, you will generate hypotheses that are useful for predicting which tweet out of a pair will be retweeted more.
- Each tweet pair is posted by the same user and contains similar content with slight wording differences.
- Focus on these wording differences.
- Please generate 1 hypotheses in the format:
  HP: [hypothesis]
- Make your hypotheses general enough to apply to new tweet pairs.

### Part 2: Hypothesis-Based Inference
Using the hypotheses you just generated, apply them to a given pair of tweets.
- Predict which tweet will be retweeted more based on the learned patterns.
- Answer in the format:
  **Final answer: the _ tweet** (where `_` is either `first` or `second`)

Think step by step:
1. Can your hypothesis apply to the tweets?
2. Analyze the textual differences.
3. Decide which tweet is more likely to be retweeted.
4. Provide your final pr