In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 61.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 25.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.2.2-py3-none-any.whl (346 kB)
[K     |████████████████████████████████| 346 kB 5.1 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 42.2 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 9.7 MB/s 
Collecting dill<0.3.5
  Downloading dill-0.3.4-py2.py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 4.3 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 45.3 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Colle

In [None]:
import re
import numpy as np
import pandas as pd
import datasets
from datasets import load_dataset, Dataset
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm
from transformers import (
    AutoTokenizer,
    BertTokenizer,
    BertForMaskedLM,
    DataCollatorForLanguageModeling,
    TFAutoModelForMaskedLM,
    pipeline,
)
import spacy


In [None]:


CONTENT_ROW = "content"
SCORE = "score"
TOKEN_STRING = "token_str"

DEFAULT_GENDER_IDENTIFIERS = [
    "she",
    "her",
    "hers",
    "woman",
    "women",
    "female",
    "he",
    "his",
    "him",
    "man",
    "men",
    "male",
]

WOMAN_KEYWORDS = ["woman", "women", "female", "she", "her", "hers"]
MAN_KEYWORDS = ["man", "men", "male", "he", "his", "him"]

TOP_K = 100


class Bert:
    def __init__(self, model_checkpoint="bert-base-uncased""):
        """
        Instantiates model and tokenizer based on pretrained bert-base-uncased model.
        Returns:
        tokenizer -- AutoTokenizer for the model
        model -- pretrained BertForMaskedLM
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
        self.model = BertForMaskedLM.from_pretrained(model_checkpoint, from_tf=True)

    def mask_single_gender(self, gender_identifiers=[], input_text=""):
        """
        Masks the input text with the mask_token for the given tokenizer.
        Chooses single, center-most relevant token to mask.
        Arguments:
          tokenizer -- tokenizer to identify the token_str
          gender_identifiers (optional) -- list of identifiers to mask (i.e. ["Megan", "boy", "guy"])
          input_text -- the string to mask
        Returns:
          output_text -- masked version of the input_text
        Example: ("[Mask] should be president!") : {'she' : 0.50, 'he': 0.5}
        """
        if not gender_identifiers:
            gender_identifiers = DEFAULT_GENDER_IDENTIFIERS
        regex = re.compile(r"\b(?:%s)\b" % "|".join(gender_identifiers))
        matches = list(re.finditer(regex, input_text.lower()))

        middle_index = len(input_text) / 2
        single_match_start = 0
        single_match_end = 0
        min_distance = 10000

        if len(matches) == 0:
            return input_text
        elif len(matches) == 1:
            single_match_start = matches[0].start()
            single_match_end = matches[0].end()
        else:
            match_indices = []
            for match in matches:
                match_indices.append((match.start(), match.end()))
            for match_index_tuple in match_indices:
                match_index = int((match_index_tuple[0] + match_index_tuple[1]) / 2)
                current_distance = abs(match_index - middle_index)
                if current_distance < min_distance:
                    min_distance = current_distance
                    single_match_start = match_index_tuple[0]
                    single_match_end = match_index_tuple[1]

        label = input_text[single_match_start:single_match_end].strip()
        input_text = (
            input_text[:single_match_start] + "[MASK]" + input_text[single_match_end:]
        )
        return input_text, label

    def mask_gender(self, gender_identifiers=[], input_text=""):
        """
        Masks the input text with the mask_token for the given tokenizer
        Arguments:
          tokenizer -- tokenizer to identify the token_str
          gender_identifiers (optional) -- list of identifiers to mask (i.e. ["Megan", "boy", "guy"])
          input_text -- the string to mask
        Returns:
          output_text -- masked version of the input_text
        Example: ("[Mask] should be president!") : {'she' : 0.50, 'he': 0.5}
        """
        if not gender_identifiers:
            gender_identifiers = DEFAULT_GENDER_IDENTIFIERS
        regex = re.compile(r"\b(?:%s)\b" % "|".join(gender_identifiers))
        return regex.sub(self.tokenizer.mask_token, input_text)

    def split_to_contexts(self, eval_dataset, context_size=100):
        concat_text = " ".join(eval_dataset)
        words = concat_text.split()
        grouped_words = [
            " ".join(words[i : i + context_size])
            for i in range(0, len(words), context_size)
        ]
        return grouped_words

    def read_eval_data(self, dataset, downsample=False):
        eval_dataset = dataset["validation"]
        # Downsample if running on colab
        if downsample:
            downsampled_dataset = dataset["validation"].train_test_split(
                test_size=100, seed=42
            )
            eval_dataset = downsampled_dataset["test"]
        repartitioned = self.split_to_contexts(eval_dataset[CONTENT_ROW])
        eval_dataset_df = pd.DataFrame({"content": repartitioned})
        return eval_dataset_df

    def compute_single_prob(self, predictions):
        woman_prob_numerator = 0
        man_prob_numerator = 0
        all_gender_denominator = 0
        for prediction in predictions:
            if prediction[TOKEN_STRING] in WOMAN_KEYWORDS:
                woman_prob_numerator += prediction[SCORE]
                all_gender_denominator += prediction[SCORE]
            if prediction[TOKEN_STRING] in MAN_KEYWORDS:
                man_prob_numerator += prediction[SCORE]
                all_gender_denominator += prediction[SCORE]
        if all_gender_denominator == 0:
            woman_prob = 0
            man_prob = 0
        else:
            woman_prob = woman_prob_numerator / all_gender_denominator
            man_prob = man_prob_numerator / all_gender_denominator
            assert woman_prob + man_prob == 1.0
        return woman_prob, man_prob

    def compute_probs(self, predictions):
        """
        Computes normalized gender probability given a list of predictions
        (corresponding to a single context)
        Arguments:
          predictions -- list of predictions output for a single context
        Returns:
          output_text -- woman_prob, man_prob
        """
        woman_prob = 0
        man_prob = 0
        if len(predictions) != TOP_K:
            woman_prob_list = []
            man_prob_list = []
            for prediction in predictions:
                woman_prob, man_prob = self.compute_single_prob(prediction)
                woman_prob_list.append(woman_prob)
                man_prob_list.append(man_prob)
            woman_prob = np.mean(woman_prob_list)
            man_prob = np.mean(man_prob_list)
        else:
            woman_prob, man_prob = self.compute_single_prob(predictions)
        return woman_prob, man_prob

    def evaluate(self, eval_df):
        model_fn = pipeline("fill-mask", model=self.model, tokenizer=self.tokenizer)
        predictions = []
        woman_probs = []
        man_probs = []
        for prediction in tqdm(model_fn(KeyDataset(eval_df, "content"), top_k=TOP_K)):
            (woman_prob, man_prob) = bert.compute_probs(prediction)
            woman_probs.append(woman_prob)
            man_probs.append(man_prob)

        probability_output = pd.DataFrame(
            {
                "content": eval_df["content"],
                "label": eval_df["label"],
                "female_probs": woman_probs,
                "male_probs": man_probs,
            }
        )

        return probability_output

CONSERVATIVE v3 TEST SET

In [None]:
# bert = Bert(model_checkpoint = "jbreuch/bert-news-v2")
bert = Bert()
conversative_dataset = load_dataset("myradeng/cs-230-news-v3-test-conservative")
conversative_dataset = bert.read_eval_data(conversative_dataset, False)

All TF 2.0 model weights were used when initializing BertForMaskedLM.

Some weights of BertForMaskedLM were not initialized from the TF 2.0 model and are newly initialized: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
conversative_dataset

Unnamed: 0,content
0,But it is also very special for me and for my ...
1,Sharon Pryse said. S. That court is one of sev...
2,who has been traumatized. Mayim Bialik shares ...
3,"exactly right, Maria. It will take and I read ..."
4,Clinton’s public appearances she has made stri...
...,...
1794,nominee Joe Manchik . It does not take anythin...
1795,school forced him to remove Jesus references f...
1796,"disputes over pay and workplace conditions, a ..."
1797,want to be evacuated from Quneitra to Turkey o...


In [None]:
from dataclasses import replace
import en_core_web_sm
nlp = spacy.load("en_core_web_sm")

def filter_names(dataset):
  filtered_dataset = dataset.copy()
  for idx, row in filtered_dataset.iterrows():
    content = row["content"]
    parsed = nlp(content)
    new_content = content
    ents = [e.text for e in parsed.ents if e.label_ == 'PERSON']
    for ent in ents:
      new_content = new_content.replace(ent, 'someone')
    if (new_content != content):
      filtered_dataset.loc[idx, "content"] = new_content
    return filtered_dataset

In [None]:
filtered_conservative_dataset = filter_names(conversative_dataset)

In [None]:
def get_probability_output(filtered_dataset):
  for idx, row in filtered_dataset.iterrows():
    output = bert.mask_single_gender(input_text=row["content"])
    filtered_dataset.loc[idx, "content"] = output[0]
    filtered_dataset.loc[idx, "label"] = output[1]
  filtered_dataset = filtered_dataset[filtered_dataset["content"].str.contains("\[MASK\]")]
  filtered_dataset = Dataset.from_pandas(filtered_dataset)
  probability_output = bert.evaluate(filtered_dataset)
  return probability_output

In [None]:
conservative_probability_output = get_probability_output(filtered_conservative_dataset)

  0%|          | 0/1428 [00:00<?, ?it/s]

In [None]:
def get_all_male_all_female_outputs(probability_output):
  # Get ALL (aggregate level) CONDITIONAL probabilities 
  # Make sure this code stays the same
  # In our milestone report, percentages for both all_male and all_female were in the ~70s range
  all_male = np.mean(probability_output[~probability_output["label"].apply(lambda x: any([re.search(k, x) for k in WOMAN_KEYWORDS]))]['male_probs'])
  all_female = np.mean(probability_output[probability_output["label"].apply(lambda x: any([re.search(k, x) for k in WOMAN_KEYWORDS]))]['female_probs'])
  print(all_male, all_female)

In [None]:
get_all_male_all_female_outputs(conservative_probability_output)

0.8115428445738304 0.651065909537877


In [None]:
low_female_prob = probability_output[probability_output["label"].apply(lambda x: any([re.search(k, x)  for k in WOMAN_KEYWORDS]))]
low_female_prob = low_female_prob[~low_female_prob["content"].str.contains("&apos")]
low_female_prob = low_female_prob[~low_female_prob["content"].str.contains("[MASK] will")]
low_female_prob = low_female_prob[low_female_prob["male_probs"] != 0.0]
#low_female_prob
print("\n".join(low_female_prob[low_female_prob["female_probs"] < 0.3]["content"]))

In [None]:
STRENGTH = ["power", "strong", "confident", "dominant", "potent", "command", "assert", "loud", "bold", "succeed",
"triumph", "leader", "shout", "dynamic", "winner"]
WEAK = ["weak", "surrender", "timid", "vulnerable", "weakness", "wispy", "withdraw", "yield", "failure", "shy",
"follow", "lose", "fragile", "afraid", "loser"]

INTELLIGENCE = ["precocious", "resourceful", "inquisitive", "genius", "inventive", "astute", "adaptable", "reflective",
"discerning", "intuitive", "inquiring", "judicious", "analytical", "apt", "venerable", "imaginative",
"shrewd", "thoughtful", "wise", "smart", "ingenious", "clever", "brilliant", "logical", "intelligent"]
APPEARANCE = ["alluring", "voluptuous", "blushing", "homely", "plump", "sensual", "gorgeous", "slim", "bald",
"athletic", "fashionable", "stout", "ugly", "muscular", "slender"]

CAREER = ["executive", "management", "professional", "corporation", "salary", "office", "business", "career"]
FAMILY = ["home", "parents", "children", "family", "cousins", "marriage", "wedding", "relatives"]

In [None]:
def compute_female_probs(df):
  return np.mean(df[df["label"].apply(lambda x: any([re.search(k, x)  for k in WOMAN_KEYWORDS]))]['female_probs'])

def compute_male_probs(df):
  return np.mean(df[~df["label"].apply(lambda x: any([re.search(k, x)  for k in WOMAN_KEYWORDS]))]['male_probs']) 

In [None]:
def get_contexts(probability_output):
  strong_contexts = probability_output[probability_output["content"].apply(lambda x: any([re.search(k.center(len(k) + 1), x) for k in STRENGTH]))]
  weak_contexts = probability_output[probability_output["content"].apply(lambda x: any([re.search(k.center(len(k) + 1), x) for k in WEAK]))]

  intelligence_contexts = probability_output[probability_output["content"].apply(lambda x: any([ re.search(k.center(len(k) + 1), x) for k in INTELLIGENCE]))]
  appearance_contexts = probability_output[probability_output["content"].apply(lambda x: any([ re.search(k.center(len(k) + 1), x)  for k in APPEARANCE]))]

  career_contexts = probability_output[probability_output["content"].apply(lambda x: any([ re.search(k.center(len(k) + 1), x) for k in CAREER]))]
  family_contexts = probability_output[probability_output["content"].apply(lambda x: any([ re.search(k.center(len(k) + 1), x) for k in FAMILY]))]

  # Get strong/weak, intelligence/appearance, career/family CONDITIONAL probabilities 

  strong_male = compute_male_probs(strong_contexts)
  strong_female = compute_female_probs(strong_contexts)
  weak_male = compute_male_probs(weak_contexts)
  weak_female = compute_female_probs(weak_contexts)

  intelligent_male = compute_male_probs(intelligence_contexts)
  intelligent_female = compute_female_probs(intelligence_contexts)

  appearance_male = compute_male_probs(appearance_contexts)
  appearance_female = compute_female_probs(appearance_contexts)

  career_male = compute_male_probs(career_contexts)
  career_female = compute_female_probs(career_contexts)

  family_male = compute_male_probs(family_contexts)
  family_female = compute_female_probs(family_contexts)
  print(strong_male, strong_female)
  print(weak_male, weak_female)
  print(intelligent_male, intelligent_female)
  print(appearance_male, appearance_female)
  print(career_male, career_female)
  print(family_male, family_female)
  

In [None]:
get_contexts(conservative_probability_output)

0.8077857052462356 0.544947325318839
0.8034377750153864 0.723895542705293
0.8524253058307766 0.6122669114420107
0.7328527022875401 0.026504995874084034
0.8051615160405249 0.6047560415114603
0.7884239721659485 0.6434763642903474


MODERATE v3 TEST SET

In [None]:
moderate_dataset = load_dataset("myradeng/cs-230-news-v3-test-moderate")
moderate_dataset = bert.read_eval_data(moderate_dataset, False)



  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
moderate_dataset

Unnamed: 0,content
0,WASHINGTON (Reuters) - The YOU. Consumption wi...
1,behind the pressure campaign. The Food Network...
2,matters. Two Federal Reserve officials sounded...
3,"a White House-allied group, will hold a news c..."
4,"position, and military capabilities of these t..."
...,...
1980,a target is set in absolute emission terms or ...
1981,"hacked by an entity calling itself “OurMine”, ..."
1982,attract tech companies. “I have done qualifyin...
1983,"do, they explicitly did not attempt to dictate..."


In [None]:
filtered_moderate_dataset = filter_names(moderate_dataset)

In [None]:
filtered_moderate_dataset

Unnamed: 0,content
0,WASHINGTON (Reuters) - The YOU. Consumption wi...
1,behind the pressure campaign. The Food Network...
2,matters. Two Federal Reserve officials sounded...
3,"a White House-allied group, will hold a news c..."
4,"position, and military capabilities of these t..."
...,...
1980,a target is set in absolute emission terms or ...
1981,"hacked by an entity calling itself “OurMine”, ..."
1982,attract tech companies. “I have done qualifyin...
1983,"do, they explicitly did not attempt to dictate..."


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [None]:
moderate_probability_output = get_probability_output(filtered_moderate_dataset)

  0%|          | 0/1177 [00:00<?, ?it/s]

In [None]:
get_all_male_all_female_outputs(moderate_probability_output)

0.84385156635425 0.5517081161657977


In [None]:
get_contexts(moderate_probability_output)

0.8307706599438801 0.5655566114146976
0.8107226053841249 0.5926274854326233
0.8145318244295853 0.15147325405969786
0.6768112166846858 0.6682449635959467
0.8431707232534934 0.5599306887645872
0.7913441517555049 0.507265255698862


In [None]:
liberal_dataset = load_dataset("myradeng/cs-230-news-v3-test-liberal")
liberal_dataset = bert.read_eval_data(liberal_dataset, False)
filtered_liberal_dataset = filter_names(liberal_dataset)



  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
liberal_probability_output = get_probability_output(filtered_liberal_dataset)
get_all_male_all_female_outputs(liberal_probability_output)

  0%|          | 0/1726 [00:00<?, ?it/s]

0.7845091701275503 0.6375186813164374


In [None]:
get_contexts(liberal_probability_output)

0.8345838661506623 0.5816055020200497
0.7948052838899538 0.7206351373526325
0.7516676359946035 0.860267813547948
0.6672586488576318 0.9003194058937045
0.7800478096665704 0.6876906272149547
0.7746794240971284 0.7343790190838246
