# Embedding

In [1]:
from datasets import load_dataset
import pandas as pd
import os
import torch

# Restrict CUDA to GPU index 1; this cell only builds text lists, so its
# `device` is the CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = str(1)
device = torch.device("cpu")

# De-duplication below uses dict.fromkeys instead of set(): it keeps first-seen
# order, so the resulting list (and any contexts.csv written from it) has the
# same row order on every run. set() order over strings varies between
# interpreter runs, which would silently misalign texts with saved embeddings.

# MHS 10354
dataset = load_dataset("ucberkeley-dlab/measuring-hate-speech")
df = pd.DataFrame(dataset['train'])
filtered_df = df[df['hate_speech_score'] > 0.5]
contexts = filtered_df['text'].tolist()
contexts1 = list(dict.fromkeys(contexts))

# Hatemoderate
df = pd.read_csv('../postprocess/all_examples_0601_hate.csv', sep="\t")
contexts2 = df['sentence'].tolist()

# hatexplain 4384
# Keep posts where at least half of the annotators chose label id 2.
# NOTE(review): the hatexplain dataset card appears to list label ids as
# 0=hatespeech, 1=normal, 2=offensive -- confirm that 2 is the intended class.
dataset = load_dataset("hatexplain")
filtered_data = []
for entry in dataset["train"]:
    labels = entry['annotators']['label']
    count_of_2 = labels.count(2)
    if count_of_2 >= len(labels) / 2:
        filtered_data.append(entry)
post_tokens_list = [entry['post_tokens'] for entry in filtered_data]
contexts3 = [' '.join(tokens) for tokens in post_tokens_list]

# tweets_hate_speech_detection
dataset = load_dataset("tweets_hate_speech_detection")
contexts5 = [sent for sent, label in zip(dataset['train']['tweet'], dataset['train']['label']) if label == 1]

# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0; malformed rows are still skipped
# with a warning.

# comments
df = pd.read_csv('../raw_datasets/comments.csv', sep = ',' , on_bad_lines='warn', header=None, names=['0', '1'])
contexts6 = df['1'].tolist()

# davidson 4k
df = pd.read_csv('../raw_datasets/davidson.csv', sep=",", on_bad_lines='warn')
df = df[df['hate_speech'] != 0]
contexts7 = df['tweet'].tolist()

# dynahate 22175
df = pd.read_csv('../raw_datasets/dynahate.csv', sep=",", on_bad_lines='warn')
df = df[df['label'] == 'hate']
contexts8 = df['text'].tolist()

# civil_comments (disabled; there is no contexts4 in the combined list)
# dataset = load_dataset("civil_comments")
# contexts4 = df['sentence'].tolist()

combined_contexts = contexts1 + contexts2 + contexts3 + contexts5 + contexts6 + contexts7 + contexts8

df_combined = pd.DataFrame({'sentence': combined_contexts})
contexts = df_combined['sentence'].tolist()
contexts = list(dict.fromkeys(contexts)) # size 60235

Found cached dataset parquet (/data/jzheng36/huggingface/datasets/ucberkeley-dlab___parquet/ucberkeley-dlab--measuring-hate-speech-c32713cabe528196/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [4]:
# Persist the de-duplicated texts as a single-column CSV (header "contexts",
# no index column) so they can be reloaded alongside the saved embeddings.
df_contexts = pd.DataFrame(data={'contexts': contexts})
df_contexts.to_csv(path_or_buf='contexts.csv', index=False)

In [5]:
import pandas as pd
import torch
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer
import os
from tqdm import trange, tqdm
from datasets import load_dataset

# NOTE(review): `dataset` is loaded here but not referenced again in this cell.
dataset = load_dataset("ucberkeley-dlab/measuring-hate-speech")

os.environ["CUDA_VISIBLE_DEVICES"] = str(1)
device = torch.device("cuda")

# Stock DPR tokenizers and encoders; both encoders are moved to the GPU.
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')

context_model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base').to(device)
question_model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base').to(device)

# Optional: restore locally saved weights instead of the stock checkpoints.
#context_model.load_state_dict(torch.load("context_model.pth"))
#question_model.load_state_dict(torch.load("question_model.pth"))

# `contexts` is not built in this cell; it must already exist in the kernel.

# Encode the passages in fixed-size batches and collect one embedding row
# per passage, in the same order as `contexts`.
batch_size = 1024
embedding_batches = []
for start in tqdm(range(0, len(contexts), batch_size)):
    batch_texts = contexts[start:start + batch_size]
    encoded = context_tokenizer(
        batch_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    )
    input_ids = encoded["input_ids"].to(device)
    with torch.no_grad():
        batch_embeddings = context_model(input_ids).pooler_output
    embedding_batches.append(batch_embeddings)

# Stack the per-batch outputs into one tensor and write it to disk.
all_context_embeddings = torch.cat(embedding_batches, dim=0)
torch.save(all_context_embeddings, 'all_context_embeddings.pth')

Setting ds_accelerator to cuda (auto detect)


Found cached dataset parquet (/data/jzheng36/huggingface/datasets/ucberkeley-dlab___parquet/ucberkeley-dlab--measuring-hate-speech-c32713cabe528196/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.weight', 'ctx_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

# Loading

In [1]:
import pandas as pd
import torch
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer
import os
from tqdm import trange, tqdm
from datasets import load_dataset

# Restrict CUDA to GPU index 1 (set before the first CUDA call so it takes effect).
os.environ["CUDA_VISIBLE_DEVICES"] = str(1)
# Fall back to the CPU when no GPU is visible, so later `.to(device)` calls
# do not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def retrieve_passages(query, all_context_embeddings, top_k=10, verbose=True):
    """
    Retrieve the passages that best match the given query using DPR.

    Relies on notebook-level names that must exist before the first call:
    ``question_tokenizer``, ``question_model``, ``device`` and ``contexts``.
    ``contexts[i]`` is expected to be the passage whose embedding is row ``i``
    of ``all_context_embeddings``.

    :param query: The query string (a single query, not a batch).
    :param all_context_embeddings: 2-D tensor with one embedding row per
        passage; it must live on the same device as the question encoder.
    :param top_k: Maximum number of passages to return (default 10).
    :param verbose: If True (default), print the full score tensor.
    :return: Tuple ``(top_contexts, top_scores)``: the best-matching passages
        and their dot-product scores, both ordered from best to worst.
    """
    # Tokenize the query and compute its embedding
    query_input_ids = question_tokenizer(query, return_tensors="pt")["input_ids"].to(device)
    with torch.no_grad():
        query_embeddings = question_model(query_input_ids).pooler_output

    # Dot-product similarity: (1, dim) @ (dim, n) -> (1, n). Only the query axis
    # is dropped, so an index holding a single passage still yields a 1-D tensor.
    scores = torch.matmul(query_embeddings, all_context_embeddings.T).squeeze(0)

    if verbose:
        print(scores)

    # Never ask for more results than there are passages.
    k = max(0, min(top_k, scores.shape[0]))
    best_scores, best_indices = torch.topk(scores, k)

    top_indices = best_indices.tolist()
    top_scores = best_scores.tolist()

    # Map embedding rows back to their passage texts
    top_contexts = [contexts[idx] for idx in top_indices]

    return top_contexts, top_scores

Setting ds_accelerator to cuda (auto detect)


In [5]:
# Reload the passages written to contexts.csv. `keep_default_na=False` keeps
# every cell as text: with the default NA handling, passages such as "NA",
# "null" or "nan" would come back as float NaN instead of strings.
df = pd.read_csv('contexts.csv', keep_default_na=False)
contexts = df['contexts'].tolist()

In [6]:
# Load the saved passage embeddings straight onto `device`. Without
# `map_location` the tensor is restored to the device it was saved from, which
# fails when that device is unavailable and can mismatch the question encoder.
all_context_embeddings = torch.load('all_context_embeddings.pth', map_location=device)
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')

context_model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base').to(device)
question_model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base').to(device)

# Optional: restore locally saved weights instead of the stock checkpoints.
# context_model.load_state_dict(torch.load("context_model.pth"))
# question_model.load_state_dict(torch.load("question_model.pth"))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.weight', 'ctx_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

# Test

In [7]:
query = "Dehumanize women as certain objects"
# retrieve_passages returns (passages, scores), in that order.
matching_passages, scores = retrieve_passages(query, all_context_embeddings)
for passage, score in zip(matching_passages, scores):
    print(passage,score)

tensor([51.8427, 54.2000, 53.8861,  ..., 53.1062, 52.7941, 42.4557],
       device='cuda:0')
women are objects to be used and abused 82.40365600585938
Women with masculine spirits are trash. 80.57473754882812
Impregnate you violently and then sodomize you 79.72418212890625
Women in general are more likely to develop degradation kinks because of societal expectations and stereotypes. it's fucked up but being seen as inferior can cause them to cope by fetishizing it. 79.17147064208984
If a man did this to a woman, it would be rape. Hypocrisy, thy name is woman. 78.76933288574219
sexism may be harmful to men's mental health...#men  #woman   78.10639953613281
women are like beetles; they are gross and disgusting 77.6471939086914
Females that are lustful and sexually degenerate are the ugliest of women. 77.6098403930664
Women justify a lot of the awful shit they do to men (divorce rape, taking full custody when the man demonstrates he's a good father, false sexual misconduct accusations) by

In [26]:
query = "Mocking or attacking the beliefs of muslims"
# retrieve_passages returns (passages, scores), in that order.
matching_passages, scores = retrieve_passages(query, all_context_embeddings)
for passage, score in zip(matching_passages, scores):
    print(passage,score)

tensor([48.4239, 55.8075, 56.7317,  ..., 54.8288, 60.5673, 48.6825],
       device='cuda:0')
muslim ideology is a crippled, low-resolution, cynical look at human nature life and the world we inhabit. 85.03131866455078
Islam instructs people to be vicious animals and literally forbids any forms of science and human decency. 84.15936279296875
or Muslim men being aggressive & poignantly sneering and rude when others are not looking. 84.15425109863281
Caustic, targeted humour is funny - particularly when aimed at politicians or political events. But it has a downside: it tends to defuse outrage at political indecency or dangerous stupidity.  When people should be genuinely offended, and angry enough to object and protest, they often end up snickering at what a comedian said about it. And the humour - the wisecrack reactions to the situation - can become more important than the issue being criticized. 83.82343292236328
Offensive cartoons about Mohammed have upset many nations in which Islam

# Dataset Analysis

In [1]:
from datasets import load_dataset
import pandas as pd

# Corpus 1: every text in the train split of Measuring Hate Speech (unfiltered).
dataset = load_dataset("ucberkeley-dlab/measuring-hate-speech")
contexts1 = dataset["train"]["text"]

# Corpus 2: the `sentence` column of the tab-separated examples file.
df = pd.read_csv('../postprocess/all_examples_0601_hate.csv', sep="\t")
contexts2 = df.loc[:, 'sentence'].tolist()

# Concatenate both corpora, first corpus first.
combined_contexts = [*contexts1, *contexts2]

# Single-column frame over the combined texts; shown as the cell output.
df_combined = pd.DataFrame(data={'text': combined_contexts})
df_combined

Found cached dataset parquet (/data/jzheng36/huggingface/datasets/ucberkeley-dlab___parquet/ucberkeley-dlab--measuring-hate-speech-c32713cabe528196/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text
0,Yes indeed. She sort of reminds me of the elde...
1,The trans women reading this tweet right now i...
2,Question: These 4 broads who criticize America...
3,It is about time for all illegals to go back t...
4,For starters bend over the one in pink and kic...
...,...
140202,It is not COVID-19 but #WuhanVirus
140203,@HuXijin_GT Stay hygienic in your eating habit...
140204,@Gailyfleur @nytimes Communism created wet mar...
140205,@CMOMaharashtra Why dont ya say something abou...


In [1]:
from datasets import load_dataset
import pandas as pd

# Load Measuring Hate Speech and materialise its train split as a DataFrame,
# which is displayed as the cell output.
dataset = load_dataset("ucberkeley-dlab/measuring-hate-speech")
train_data = dataset['train']
df = pd.DataFrame(data=train_data)
df

Found cached dataset parquet (/data/jzheng36/huggingface/datasets/ucberkeley-dlab___parquet/ucberkeley-dlab--measuring-hate-speech-c32713cabe528196/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,...,annotator_religion_hindu,annotator_religion_jewish,annotator_religion_mormon,annotator_religion_muslim,annotator_religion_nothing,annotator_religion_other,annotator_sexuality_bisexual,annotator_sexuality_gay,annotator_sexuality_straight,annotator_sexuality_other
0,47777,10873,3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
1,39773,2790,2,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,47101,3379,3,4.0,4.0,4.0,4.0,4.0,4.0,0.0,...,False,False,False,False,True,False,False,False,True,False
3,43625,7365,3,2.0,3.0,2.0,1.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
4,12538,488,0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135551,37080,8590,2,1.0,1.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
135552,22986,8303,2,2.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,True,True,False,False,False
135553,21008,6207,2,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
135554,22986,7886,2,2.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,True,False,False,False,True,False


In [2]:
# Keep rows whose hate_speech_score exceeds 0.5, then measure how many of the
# selected texts are repeats of one another.
score_above_threshold = df['hate_speech_score'] > 0.5
filtered_df = df[score_above_threshold]
contexts = filtered_df['text'].tolist()
unique_contexts = set(contexts)
duplicate_count = len(contexts) - len(unique_contexts)

print(f"Number of duplicate entries: {duplicate_count}")

Number of duplicate entries: 38694


In [25]:
# Number of distinct texts currently held in `contexts`.
len(frozenset(contexts))

10354

In [4]:
import pandas as pd
import torch
from transformers import (DPRContextEncoder, DPRContextEncoderTokenizer,
                          DPRQuestionEncoder, DPRQuestionEncoderTokenizer,
                          DPRReader, DPRReaderTokenizer)

# Checkpoint names for the two DPR encoders used by this cell.
CTX_ENCODER_CHECKPOINT = 'facebook/dpr-ctx_encoder-single-nq-base'
QUESTION_ENCODER_CHECKPOINT = 'facebook/dpr-question_encoder-single-nq-base'

# Passage side: tokenizer plus encoder, switched to evaluation mode.
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(CTX_ENCODER_CHECKPOINT)
context_model = DPRContextEncoder.from_pretrained(CTX_ENCODER_CHECKPOINT)
context_model = context_model.eval()

# Query side: tokenizer plus encoder, switched to evaluation mode.
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(QUESTION_ENCODER_CHECKPOINT)
question_model = DPRQuestionEncoder.from_pretrained(QUESTION_ENCODER_CHECKPOINT)
question_model = question_model.eval()
    
def retrieve_and_extract(query, contexts, top_k=5, batch_size=64):
    """
    Retrieve the passages most similar to the query with DPR, then extract an
    answer span from the most relevant one using DPRReader.

    Uses the notebook-level ``context_tokenizer``, ``context_model``,
    ``question_tokenizer`` and ``question_model``.

    :param query: The query string.
    :param contexts: List of context passages.
    :param top_k: Number of retrieved passages handed to the reader (default 5).
    :param batch_size: Number of passages encoded per forward pass (default 64).
    :return: Tuple ``(answer, passage)``: the extracted answer text and the
        passage it was extracted from.
    :raises ValueError: If ``contexts`` is empty.
    """
    if len(contexts) == 0:
        raise ValueError("contexts must contain at least one passage")

    # Inference only: without no_grad every forward pass keeps its autograd
    # graph alive. Encoding in batches bounds peak memory for long lists.
    with torch.no_grad():
        embedding_batches = []
        for start in range(0, len(contexts), batch_size):
            encoded_contexts = context_tokenizer(
                contexts[start:start + batch_size],
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=128,
            )
            embedding_batches.append(context_model(**encoded_contexts).pooler_output)
        context_embeddings = torch.cat(embedding_batches, dim=0)
        context_embeddings = context_embeddings / context_embeddings.norm(dim=1, keepdim=True)

        encoded_query = question_tokenizer(query, return_tensors="pt")
        query_embeddings = question_model(**encoded_query).pooler_output
        query_embeddings = query_embeddings / query_embeddings.norm(dim=1, keepdim=True)

    # Cosine similarity (both sides are L2-normalised above). Only the query
    # axis is dropped so a single-passage input still yields a 1-D tensor.
    scores = torch.matmul(query_embeddings, context_embeddings.T).squeeze(0)
    k = max(1, min(top_k, scores.shape[0]))
    top_indices = torch.topk(scores, k).indices.tolist()
    top_passages = [contexts[idx] for idx in top_indices]

    # Extract answers using DPRReader
    # NOTE(review): the reader is re-loaded on every call; hoist it to notebook
    # level if this function is called repeatedly.
    reader_tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
    reader = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
    reader.eval()

    inputs = reader_tokenizer(
        questions=[query] * len(top_passages),
        texts=top_passages,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128
    )

    with torch.no_grad():
        outputs = reader(**inputs)

    # Rank passages with the reader's per-passage relevance score. Summing the
    # span logits over every position (padding included) is not a passage score.
    best_passage_idx = torch.argmax(outputs.relevance_logits).item()

    # Ignore padded positions and force the span end to be at or after its
    # start; independent argmaxes can otherwise produce an empty or padded span.
    is_real_token = inputs["attention_mask"][best_passage_idx].bool()
    start_logits = outputs.start_logits[best_passage_idx].masked_fill(~is_real_token, float("-inf"))
    end_logits = outputs.end_logits[best_passage_idx].masked_fill(~is_real_token, float("-inf"))
    start_idx = torch.argmax(start_logits).item()
    end_idx = start_idx + torch.argmax(end_logits[start_idx:]).item()

    answer = reader_tokenizer.decode(inputs.input_ids[best_passage_idx][start_idx:end_idx + 1], skip_special_tokens=True)

    return answer, top_passages[best_passage_idx]

# Run one example query against the first 1000 sentences of the examples file
# and print the extracted answer together with the passage it came from.
df = pd.read_csv('../postprocess/all_examples_0601_hate.csv', sep="\t")
candidate_passages = df['sentence'].iloc[:1000].tolist()

query = "Content mocking someone for their personality, opinions, character or emotional state"
answer, relevant_passage = retrieve_and_extract(query, candidate_passages)

print(
    f"Query: {query}",
    f"Answer: {answer}",
    f"Relevant Passage: {relevant_passage}",
    sep="\n",
)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

Query: Content mocking someone for their personality, opinions, character or emotional state
Answer: racist
Relevant Passage: Blah blah blah, even during a deadly pandemic there's still morons like this ngr playing the victim, Men are also disproportionately killed by coronavirus so not only is it racist it's also sexist ......
