In this notebook, we deploy 
- GPT-2,
- GPT-3.5, 
- OPT, 
- BLOOM 

as the generation models and collect the generated sentences.

### Load the data

In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
df = pd.read_csv('data/HateXplain/prefix_prompt.csv')

### GPT-2

we use top-1 generation, and the maximum number of generation is 30

In [10]:
# load the pre-trained model
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')

def gpt2_generate(prefix, max_length=30, num_return_sequences=1):
    set_seed(42)
    sentences = generator(prefix, max_length=max_length, num_return_sequences=num_return_sequences)
    return sentences[0]['generated_text']

In [17]:
# generate text for each prefix
tqdm.pandas()
df['generated_text'] = df['text'].progress_apply(lambda x: gpt2_generate(x))

  0%|          | 0/2113 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 2/2113 [00:00<02:46, 12.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 4/2113 [00:00<07:35,  4.63it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 6/2113 [00:01<08:18,  4.23it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 7/2113 [00:01<07:09,  4.90it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 8/2113 [00:01<08:52,  3.96it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 9/2113 [00:02<09:40,  3.62it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 10/2113

In [18]:
# save the generated text
df.to_csv('data/HateXplain/gpt2_prefix.csv', index=False)

In [19]:
df

Unnamed: 0,post_id,text,part,generated_text
0,1179088797964763136_twitter,@769288 i am bit confused coz chinese ppl can ...,prefix,@769288 i am bit confused coz chinese ppl can ...
1,13851720_gab,laura loomer raped me while screaming at me in...,prefix,laura loomer raped me while screaming at me in...
2,19679648_gab,guilty until proven innocent unless you are a,prefix,guilty until proven innocent unless you are a ...
3,23844035_gab,173750 263867 47% of brits think multicultural...,prefix,173750 263867 47% of brits think multicultural...
4,14104384_gab,why does anyone care what,prefix,why does anyone care what I'm doing to create ...
...,...,...,...,...
2108,9731096_gab,i forgot how much talcum xx lies they turned t...,prefix,i forgot how much talcum xx lies they turned t...
2109,9755956_gab,do not worry they are,prefix,"do not worry they are not going anywhere."")"
2110,9798597_gab,flushed with victory after normalizing trannie...,prefix,flushed with victory after normalizing trannie...
2111,9982674_gab,more and more leftists are moving closer to ce...,prefix,more and more leftists are moving closer to ce...


### GPT-j-6B
https://huggingface.co/EleutherAI/gpt-j-6B

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B").to('cuda')

: 

: 

In [None]:
model

### GPT-NeoXT-Chat-Base-20B
https://huggingface.co/togethercomputer/GPT-NeoXT-Chat-Base-20B?text=My+name+is+Lewis+and+I+like+to

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B")
model = AutoModelForCausalLM.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B")


### OPT-66b
https://huggingface.co/facebook/opt-66b

In [22]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

def opt_generate(prefix, max_length=30, num_return_sequences=1):
    model = AutoModelForCausalLM.from_pretrained("facebook/opt-66b", torch_dtype=torch.float16).cuda()
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-66b", use_fast=False)
    input_ids = tokenizer(prefix, return_tensors="pt").input_ids.cuda()
    generated_ids = model.generate(input_ids,  num_return_sequences=num_return_sequences, max_length=max_length)
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


In [23]:
text = "I don't like"
opt_generate(text)

Downloading:   0%|          | 0.00/676 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/89.9k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.85G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.85G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.85G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.85G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.85G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.85G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/6.36G [00:00<?, ?B/s]

: 

: 

### BLOOM
https://huggingface.co/bigscience/bloom

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom")
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom").cuda()