In [3]:
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q torch

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [4]:
from transformers import AutoTokenizer, OPTForCausalLM

tokenizer = AutoTokenizer.from_pretrained('facebook/opt-1.3b')
model = OPTForCausalLM.from_pretrained('facebook/opt-1.3b', pad_token_id=tokenizer.eos_token_id).to('cuda')

Downloading (…)okenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [6]:
from transformers import StoppingCriteriaList, StoppingCriteriaList, MaxLengthCriteria, LogitsProcessorList, TopPLogitsWarper

stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=30)])
logits_warper=LogitsProcessorList(
    [
        TopPLogitsWarper(0.92),
    ]
)

In [9]:
import torch

input_ids = tokenizer.encode('tow guys in the bar start a', return_tensors='pt').to('cuda')

torch.manual_seed(3)
print('Output:\n'+100*'-')
for i in range(5):
  output = model.sample(
      input_ids,
      logits_warper=logits_warper,
      stopping_criteria=stopping_criteria
  )
  print('{}: {}'.format(i, tokenizer.decode(output[0], skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: tow guys in the bar start a fight with em and then steal their car
"Man, I know that motherfucker you stole my
1: tow guys in the bar start a party
*turns table and dances*
2: tow guys in the bar start a soccer match after every loss to keep their spirits up
It's actually football.
3: tow guys in the bar start a conversation with the guest...   "Hey, how's the wife? You must have kids by now
4: tow guys in the bar start a fight with one of them but gets it suspended.   then they have to finish all the beers left


# Prompting

- 여전히 violent한 정보를 포함하고 있지만 이전 생성과는 꽤 다름

In [11]:
nontoxic_prompt='The following text does not contain violentm rude, disrespectful or unreasonable language:\ntext: \n'
nontoxic_prompt_len = len(nontoxic_prompt.split())
max_len = nontoxic_prompt_len + 30
input_ids = tokenizer.encode(nontoxic_prompt+'two guys in the bar start a', return_tensors='pt').to('cuda')
stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=max_len)])

torch.manual_seed(3)
print(f'Output:\n'+100*'-')
for i in range(5):
  output = model.sample(
      input_ids,
      logits_warper=logits_warper,
      stopping_criteria=stopping_criteria,
  )
  out_all = tokenizer.decode(output[0], skip_special_tokens=True)
  print('{}:{}'.format(i, out_all[out_all.find('two guys'):]))

Output:
----------------------------------------------------------------------------------------------------
0:two guys in the bar start a fight. the guy in blue grabs a girl by the arm and pulls her
1:two guys in the bar start a fight with a girl, and they call the police.  They get arrested
2:two guys in the bar start a conversation with their friends when my friends and i walk in on the group.
3:two guys in the bar start a fight which ends with a guy on the ground holding his chin up while the
4:two guys in the bar start a conversation about how all men have it difficult and how all women are this or


# Self-Debiasing

- How about prompting the LM instead of trainig expert and anti-expert models?

In [13]:
toxic_prompts = 'The following text contains violent, rude, disrespectful or unreasonbale language:\ntext: \"'

In [16]:
from transformers import LogitsProcessor
import torch.nn.functional as F

class ToxicityLogitsProcessor(LogitsProcessor):
  def __call__(self, input_ids:torch.LongTensor, scores:torch.FloatTensor)->torch.FloatTensor:
    input_ids_toxic = tokenizer.encode(toxic_prompts, return_tensors='pt').to('cuda')
    input_ids_toxic = torch.concat([input_ids_toxic, input_ids], dim=1)
    with torch.no_grad():
      toxic_scores = model(input_ids_toxic).logits[0,-1,:].reshape(1,-1)

    plm = F.log_softmax(scores,dim=-1)
    plmt = F.log_softmax(toxic_scores,dim=-1)

    delta_toxic = plm - plmt
    delta_mask = ((delta_toxic)<0).type(torch.int)

    return plm + 50.0*delta_mask*delta_toxic

In [19]:
logits_processor = LogitsProcessorList(
    [
        ToxicityLogitsProcessor(),
    ]
)

In [21]:
input_ids = tokenizer.encode('two guys in the bar start a', return_tensors='pt').to('cuda')
stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=30)])

torch.manual_seed(3)
print(f'Output:\n'+100*'-')
for i in range(5):
  output = model.sample(
      input_ids,
      logits_warper=logits_warper,
      logits_processor=logits_processor,
      stopping_criteria=stopping_criteria,
  )
  out_all = tokenizer.decode(output[0], skip_special_tokens=True)
  print('{}:{}'.format(i, out_all[out_all.find('two guys'):]))

Output:
----------------------------------------------------------------------------------------------------
0:two guys in the bar start a conversation about eminem.  "you ever listen to russell wendell bear?"  "yeah
1:two guys in the bar start a fight and then one guy pulls a gun.    the other guy flips shit gets beat by the other
2:two guys in the bar start a business providing free haircuts to the local kids  they do so by stealing the kids' cash  it's
3:two guys in the bar start a conversation about alchemy
Oi m8 what you drinking?
everybody's being an asshole today
4:two guys in the bar start a table d'hôte. one ends up really hungry, so he just keeps eating
you have to


# The other thing

In [22]:
print(tokenizer.encode('start', return_tensors='pt'))
print(tokenizer.decode([4901], return_tensors='pt'))
start_id = 13124

tensor([[    2, 13124]])
bar


In [23]:
input_ids = tokenizer.encode('two guys in the bar', return_tensors='pt').to('cuda')
scores = model(input_ids).logits[0,-1,:].reshape(1,-1)
scores = F.softmax(scores, dim=-1)
bar_normal = scores[0, start_id].item()
input_ids = tokenizer.encode(toxic_prompts+'two guys in the', return_tensors='pt').to('cuda')
toxic_scores=model(input_ids).logits[0,-1,:].reshape(1,-1)
toxic_scores=F.softmax(toxic_scores,dim=-1)
bar_toxic=toxic_scores[0,start_id].item()
print(f'Probability of generation \'start\' for the prefix: \'two guys in the bar\'')
print(f'LM: {bar_normal}')
print(f'Toxic LM: {bar_toxic}')
print(f'Delta is: {bar_normal-bar_toxic}>0')

Probability of generation 'start' for the prefix: 'two guys in the bar'
LM: 6.407179171219468e-06
Toxic LM: 1.733429755290672e-08
Delta is: 6.389844873666561e-06>0
