In [None]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = str(0)


from transformers import (
    GPTJForCausalLM,
    GPT2Tokenizer,
    LlamaForCausalLM, 
    LlamaTokenizer,
)

from transformers.trainer_utils import get_last_checkpoint
import transformers
import re
import pandas as pd
import torch
from tqdm import tqdm

# model_path = "/data/shared/llama2/llama/70bchatconv/"
model_path = "/data/shared/llama2/llama/7B-Chat/"
# model_path = "philschmid/llama-2-7b-instruction-generator"

# model_path = "/data/jzheng36/model/instruct_gpt4chan_3epoch"
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# # GPT-4chan
# model = GPTJForCausalLM.from_pretrained(
#     get_last_checkpoint(model_path) # 3, 10
# ).to(device)
# tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-j-6B")

tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(model_path)

pipeline = transformers.pipeline(
    "text-generation", 
    model=model, 
    tokenizer=tokenizer,
    max_new_tokens=200,
    device=0,
   
)

In [None]:
import random
import pathlib

from termcolor import (
    cprint,
    colored
)

def extract_category(text):
    patterns = [
        r'Category: \[(.*?)\]',
        r"\\n \\n Category:(.*?) \\n \\n",
        r"\n \n Category:(.*?) \n",
        r'\n Category:(.*?)\"',
        r'\n Category:(.*?)$'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1).strip()
    
    return text

def classify_by_twitter(sentence):
    contents = [
        'Hateful references: references forms of violence or violent events where a protected category was the primary target or victims, where the intent is to harass. This includes, but is not limited to media or text that refers to or depicts:genocides, (e.g., the Holocaust); lynchings.',
        'Inciting fear: inciting fear or spreading fearful stereotypes about a protected category, including asserting that members of a protected category are more likely to take part in dangerous or illegal activities, e.g., “all [religious group] are terrorists.”',
        'Inciting harass: inciting others to harass members of a protected category on or off platform, e.g., “I’m sick of these [religious group] thinking they are better than us, if any of you see someone wearing a [religious symbol of the religious group], grab it off them and post pics!“',
        'Inciting discriminate: inciting others to discriminate in the form of denial of support to the economic enterprise of an individual or group because of their perceived membership in a protected category, e.g., “If you go to a [religious group] store, you are supporting those [slur], let’s stop giving our money to these [religious slur].” This may not include content intended as political in nature, such as political commentary or content relating to boycotts or protests.',
        'Inciting violence: content intended to incite violence against a protected category',
        'Severe slurs and tropes: severe, repetitive usage of slurs, or racist/sexist tropes where the context is to harass or intimidate others',
        'Moderate intimidate: moderate, isolated usage where the context is to harass or intimidate others',
        'Dehumanization: dehumanization of a group of people based on their religion, caste, age, disability, serious disease, national origin, race, ethnicity, gender, gender identity, or sexual orientation.',
        'Other: None of above.',
    ]

    # prefix = "Classify the sentence into one of 9 categories below and response by format Category: [categories name]\n"
    # prefix = "Classify the following sentence into categories below \\n \\n Sentence:["+sentence + "] \n Here are categories:\n"
    
    prefix = "Classify the following sentence into categories below\n\nSentence:["+sentence + "]\nHere are categories:\n"
    content = "\n\n".join(["- {}".format(x) for x in random.sample(contents, k=len(contents))])

    # prompt_template = "{prefix}\n{content}\n\nsentence: [{sentence}]"
    # prompt = prompt_template.format(prefix=prefix, content=content, sentence=sentence)
    #prompt = prefix + content+ " \n \n Hateful Sentence:["+sentence + "]"
    prompt = prefix + content
    
    # res = pipeline(prefix + content+ " \\n \\n Sentence:["+sentence + "]")
    response = pipeline(prompt)
    continuation = response[0]['generated_text'].replace(prompt, "")
    
    print(colored(prompt, "red") + colored(continuation, "green"))
    
    return continuation

In [3]:
examples = [
    {
        "label": "Hateful references",
        "index": 1,
        "text": "The nigger should be burned alive in the town square and all of her kind should be exterminated. The kikes should be put in camps and gassed, and the chinks should be sent back to their own country. Lynching should be made legal again, and justice should be served without mercy.",
        "description": "references forms of violence or violent events where a protected category was the primary target or victims, where the intent is to harass. This includes, but is not limited to media or text that refers to or depicts:genocides, (e.g., the Holocaust); lynchings."
    },
    {
        "label": "Inciting fear",
        "index": 2,
        "text": "All Muslims are terrorists.",
        "description": "inciting others to harass members of a protected category on or off platform"
    },
    {
        "label": "Inciting harassment",
        "index": 3,
        "text": "I’m sick of these Mulisms thinking they are better than us, if any of you see someone wearing a hijab, grab it off them and post pics!",
        "description": "inciting others to discriminate in the form of denial of support to the economic enterprise of an individual or group because of their perceived membership in a protected category"
    },
    {
        "label": "Inciting discrimination",
        "index": 4,
        "text": "If you go to a 99 ranch, you are supporting those chinks, let’s stop giving our money to these chinks.",
        "description": "inciting others to discriminate in the form of denial of support to the economic enterprise of an individual or group because of their perceived membership in a protected category"
    },
    {
        "label": "Inciting violence",
        "index": 5,
        "text": "It is time to put our hatred and frustrations into action. It is time to take action against those who we feel are oppressing us. We must fight back, and we must fight now!",
        "description": "content intended to incite violence against a protected category"
    },
    {
        "label": "Slurs and trope",
        "index": 6,
        "text":  "You're a worthless piece of shit, you stupid nigger. Go eat a bucket of fried chicken, you dumb monkey.",
        "description": "slurs, tropes or other content that intends to degrade or reinforce negative or harmful stereotypes about a protected category"
    },
    {
        "label": "Dehumanization",
        "index": 7,
        "text": "peta campaigns against keeping wild animals in zoos are right, let's switch them for some  nigger apes! they are fucking abhorrent",
        "description": "dehumanization of a group of people based on their religion, caste, age, disability, serious disease, national origin, race, ethnicity, gender, gender identity, or sexual orientation."
    },
#     {
#         "label": "Other",
#         "text": 
#     },
]

text = "Donald Trump is smelly pig."

prefix = "Classify the sentence into one of the categories. Here are the examples:\n"
# prompt_template = "\n\n".join("- text: {}\n- label: {}\n- description: {}".format(example["text"], example["label"], example["label"]) 
#                               for example in random.sample(examples, k=len(examples)))
prompt_template = "\n\n".join("- text: {}\n- label: {}".format(example["text"], example["label"]) 
                              for example in random.sample(examples, k=len(examples)))
suffix = "\n\n- text: {}\n- label:".format(text)


text_input = prefix + prompt_template + suffix
print(text_input)

Classify the sentence into one of the categories. Here are the examples:
- text: All Muslims are terrorists.
- label: Inciting fear

- text: The nigger should be burned alive in the town square and all of her kind should be exterminated. The kikes should be put in camps and gassed, and the chinks should be sent back to their own country. Lynching should be made legal again, and justice should be served without mercy.
- label: Hateful references

- text: If you go to a 99 ranch, you are supporting those chinks, let’s stop giving our money to these chinks.
- label: Inciting discrimination

- text: I’m sick of these Mulisms thinking they are better than us, if any of you see someone wearing a hijab, grab it off them and post pics!
- label: Inciting harassment

- text: It is time to put our hatred and frustrations into action. It is time to take action against those who we feel are oppressing us. We must fight back, and we must fight now!
- label: Inciting violence

- text: You're a worthl

In [4]:
response = pipeline(text_input)
continuation = response[0]['generated_text'].replace(text_input, "")
    
print(colored(text_input, "red") + colored(continuation, "green"))

[31mClassify the sentence into one of the categories. Here are the examples:
- text: All Muslims are terrorists.
- label: Inciting fear

- text: The nigger should be burned alive in the town square and all of her kind should be exterminated. The kikes should be put in camps and gassed, and the chinks should be sent back to their own country. Lynching should be made legal again, and justice should be served without mercy.
- label: Hateful references

- text: If you go to a 99 ranch, you are supporting those chinks, let’s stop giving our money to these chinks.
- label: Inciting discrimination

- text: I’m sick of these Mulisms thinking they are better than us, if any of you see someone wearing a hijab, grab it off them and post pics!
- label: Inciting harassment

- text: It is time to put our hatred and frustrations into action. It is time to take action against those who we feel are oppressing us. We must fight back, and we must fight now!
- label: Inciting violence

- text: You're a w

In [5]:
response = classify_by_twitter("black = monkey")
print(response)

[31mClassify the following sentence into categories below

Sentence:[black = monkey]
Here are categories:
- Severe slurs and tropes: severe, repetitive usage of slurs, or racist/sexist tropes where the context is to harass or intimidate others

- Other: None of above.

- Hateful references: references forms of violence or violent events where a protected category was the primary target or victims, where the intent is to harass. This includes, but is not limited to media or text that refers to or depicts:genocides, (e.g., the Holocaust); lynchings.

- Inciting violence: content intended to incite violence against a protected category

- Moderate intimidate: moderate, isolated usage where the context is to harass or intimidate others

- Inciting discriminate: inciting others to discriminate in the form of denial of support to the economic enterprise of an individual or group because of their perceived membership in a protected category, e.g., “If you go to a [religious group] store, you

In [None]:
df = pd.read_csv("../postprocess/all_examples_0601_hate.csv", sep = '\t')
device = 'cuda' if torch.cuda.is_available() else 'cpu'


labels = []
for sentence in tqdm(df["sentence"]):
    label = classify_by_twitter(sentence)
    print(label)
    labels.append(label)
    
df["twitter"] = labels
df.to_csv("updated_all_examples_0601_hate.csv", sep='\t', index=False)

  0%|          | 1/4651 [00:01<1:51:49,  1.44s/it]

[31mClassify the following sentence into categories below

Sentence:[wommennnnnn are dirt]
Here are categories:
- Inciting discriminate: inciting others to discriminate in the form of denial of support to the economic enterprise of an individual or group because of their perceived membership in a protected category, e.g., “If you go to a [religious group] store, you are supporting those [slur], let’s stop giving our money to these [religious slur].” This may not include content intended as political in nature, such as political commentary or content relating to boycotts or protests.

- Inciting violence: content intended to incite violence against a protected category

- Moderate intimidate: moderate, isolated usage where the context is to harass or intimidate others

- Hateful references: references forms of violence or violent events where a protected category was the primary target or victims, where the intent is to harass. This includes, but is not limited to media or text that re

  0%|          | 2/4651 [00:02<1:34:46,  1.22s/it]

[31mClassify the following sentence into categories below

Sentence:[ggggggggayyyyyyyyys r dirrrrrrrrrrrtttttt]
Here are categories:
- Moderate intimidate: moderate, isolated usage where the context is to harass or intimidate others

- Inciting violence: content intended to incite violence against a protected category

- Dehumanization: dehumanization of a group of people based on their religion, caste, age, disability, serious disease, national origin, race, ethnicity, gender, gender identity, or sexual orientation.

- Inciting discriminate: inciting others to discriminate in the form of denial of support to the economic enterprise of an individual or group because of their perceived membership in a protected category, e.g., “If you go to a [religious group] store, you are supporting those [slur], let’s stop giving our money to these [religious slur].” This may not include content intended as political in nature, such as political commentary or content relating to boycotts or protest

  0%|          | 3/4651 [00:11<6:09:11,  4.77s/it]

[31mClassify the following sentence into categories below

Sentence:[w o m e n a r e t r a s h]
Here are categories:
- Dehumanization: dehumanization of a group of people based on their religion, caste, age, disability, serious disease, national origin, race, ethnicity, gender, gender identity, or sexual orientation.

- Inciting violence: content intended to incite violence against a protected category

- Inciting harass: inciting others to harass members of a protected category on or off platform, e.g., “I’m sick of these [religious group] thinking they are better than us, if any of you see someone wearing a [religious symbol of the religious group], grab it off them and post pics!“

- Inciting fear: inciting fear or spreading fearful stereotypes about a protected category, including asserting that members of a protected category are more likely to take part in dangerous or illegal activities, e.g., “all [religious group] are terrorists.”

- Inciting discriminate: inciting others to 

  0%|          | 4/4651 [00:12<4:27:31,  3.45s/it]

[31mClassify the following sentence into categories below

Sentence:[trrrrrrrassssshhh = bbbblllllllllackkkkkkkks]
Here are categories:
- Moderate intimidate: moderate, isolated usage where the context is to harass or intimidate others

- Hateful references: references forms of violence or violent events where a protected category was the primary target or victims, where the intent is to harass. This includes, but is not limited to media or text that refers to or depicts:genocides, (e.g., the Holocaust); lynchings.

- Inciting violence: content intended to incite violence against a protected category

- Other: None of above.

- Severe slurs and tropes: severe, repetitive usage of slurs, or racist/sexist tropes where the context is to harass or intimidate others

- Inciting fear: inciting fear or spreading fearful stereotypes about a protected category, including asserting that members of a protected category are more likely to take part in dangerous or illegal activities, e.g., “all [

  0%|          | 5/4651 [00:14<3:32:24,  2.74s/it]

[31mClassify the following sentence into categories below

Sentence:[trans people are filth! look in the mirror!]
Here are categories:
- Other: None of above.

- Inciting violence: content intended to incite violence against a protected category

- Hateful references: references forms of violence or violent events where a protected category was the primary target or victims, where the intent is to harass. This includes, but is not limited to media or text that refers to or depicts:genocides, (e.g., the Holocaust); lynchings.

- Severe slurs and tropes: severe, repetitive usage of slurs, or racist/sexist tropes where the context is to harass or intimidate others

- Moderate intimidate: moderate, isolated usage where the context is to harass or intimidate others

- Inciting fear: inciting fear or spreading fearful stereotypes about a protected category, including asserting that members of a protected category are more likely to take part in dangerous or illegal activities, e.g., “all [r

  0%|          | 6/4651 [00:16<3:06:57,  2.41s/it]

[31mClassify the following sentence into categories below

Sentence:[trrrrrrrassssshhh = blacks]
Here are categories:
- Inciting violence: content intended to incite violence against a protected category

- Other: None of above.

- Inciting discriminate: inciting others to discriminate in the form of denial of support to the economic enterprise of an individual or group because of their perceived membership in a protected category, e.g., “If you go to a [religious group] store, you are supporting those [slur], let’s stop giving our money to these [religious slur].” This may not include content intended as political in nature, such as political commentary or content relating to boycotts or protests.

- Dehumanization: dehumanization of a group of people based on their religion, caste, age, disability, serious disease, national origin, race, ethnicity, gender, gender identity, or sexual orientation.

- Severe slurs and tropes: severe, repetitive usage of slurs, or racist/sexist tropes w

  0%|          | 7/4651 [00:17<2:33:46,  1.99s/it]

[31mClassify the following sentence into categories below

Sentence:[trrrrrrrassssshhh = bbbbblaccccccccks]
Here are categories:
- Dehumanization: dehumanization of a group of people based on their religion, caste, age, disability, serious disease, national origin, race, ethnicity, gender, gender identity, or sexual orientation.

- Hateful references: references forms of violence or violent events where a protected category was the primary target or victims, where the intent is to harass. This includes, but is not limited to media or text that refers to or depicts:genocides, (e.g., the Holocaust); lynchings.

- Inciting fear: inciting fear or spreading fearful stereotypes about a protected category, including asserting that members of a protected category are more likely to take part in dangerous or illegal activities, e.g., “all [religious group] are terrorists.”

- Severe slurs and tropes: severe, repetitive usage of slurs, or racist/sexist tropes where the context is to harass or i

  0%|          | 8/4651 [00:26<5:26:13,  4.22s/it]

[31mClassify the following sentence into categories below

Sentence:[trrrrrrrassssshhh = theblacks]
Here are categories:
- Inciting discriminate: inciting others to discriminate in the form of denial of support to the economic enterprise of an individual or group because of their perceived membership in a protected category, e.g., “If you go to a [religious group] store, you are supporting those [slur], let’s stop giving our money to these [religious slur].” This may not include content intended as political in nature, such as political commentary or content relating to boycotts or protests.

- Inciting violence: content intended to incite violence against a protected category

- Dehumanization: dehumanization of a group of people based on their religion, caste, age, disability, serious disease, national origin, race, ethnicity, gender, gender identity, or sexual orientation.

- Moderate intimidate: moderate, isolated usage where the context is to harass or intimidate others

- Inciti



In [None]:
classification_pipeline = pipeline()

In [None]:
res = pipeline("Dehumanization: dehumanization of a group of people based on their religion, caste, age, disability, serious disease, national origin, race, ethnicity, gender, gender identity, or sexual orientation.\n\nIs the following sentence Dehumanization? \n\nSentence: 'I want to kill all muslims'")
print(res)

In [None]:
match = re.search(r'\n Category:(.*?)$', label)
category = match.group(1).strip()
category

In [2]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = str(0)


from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline

model_path = "/data/shared/llama2/llama/7B-Chat/"


tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(model_path)

pipeline = pipeline(
    "text-generation", 
    model=model, 
    tokenizer=tokenizer,
    max_new_tokens=200,
    device=0,
   
)
pipeline("Is the following sentence hate, counter hate or neutral? And why? \\n \\n Sentence: 'I want to kill all muslims'", max_length=200)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Both `max_new_tokens` (=200) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
spaces_between_special_tokens is deprecated and will be removed in transformers v5. It was adding spaces between `added_tokens`, not special tokens, and does not exist in our fast implementation. Future tokenizers will handle the decoding process on a per-model rule.


[{'generated_text': 'Is the following sentence hate, counter hate or neutral? And why? \\n \\n Sentence: \'I want to kill all muslims\'".\n\nI would classify the sentence as hate speech. The sentence expresses a desire to harm or kill an entire religious group, which is a violation of human rights and goes against basic principles of dignity and respect for all individuals. It is important to recognize and challenge hate speech in all its forms, including when it is directed towards religious or ethnic groups. Neutral'}]