In [4]:
import sys
import contextlib
import transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import os
import sys
import torch
import math
import pandas as pd

In [5]:
torch.cuda.is_available()

True

## Research Questions
- Can we use GPT-2 to predict media effect between partisans? --> human language processing / writing assitive tool
- Can we infer the partisanship from the tweet? --> bayesian inference
- Can we quantify the level of mutual understanding between partisans? --> semantic similarity
- Can we extract partisan-specific opinions from fine-tuned GPT-2? --> opinion generation


## Set up
    we fine-tune two GPT-2 models, call them 
         - dem_model: fine-tuned on 10M tweets from Democrats
         - repub_model: fine-tuned on 4M tweets from Republicans

In [6]:
# dems_tokenizer = GPT2Tokenizer.from_pretrained('../models/gpt2_dems_full_data_single_follow/checkpoint-1367160/')
# dem_model = GPT2LMHeadModel.from_pretrained('../models/gpt2_dems_full_data_single_follow/checkpoint-1367160/')

# dems_tokenizer = GPT2Tokenizer.from_pretrained('../models/pretrained_gpt2_dems_4M_full_data_single_follow/checkpoint-489680/')
# dem_model = GPT2LMHeadModel.from_pretrained('../models/pretrained_gpt2_dems_4M_full_data_single_follow/checkpoint-489680/')

# repub_tokenizer = GPT2Tokenizer.from_pretrained('../models/gpt2_repubs_full_data_single_follow/checkpoint-490180/')
# repub_model = GPT2LMHeadModel.from_pretrained('../models/gpt2_repubs_full_data_single_follow/checkpoint-490180/')

dems_tokenizer = GPT2Tokenizer.from_pretrained('../train_lm/models/pretrained_gpt2_2019_dem/')
dem_model = GPT2LMHeadModel.from_pretrained('../train_lm/models/pretrained_gpt2_2019_dem')

repub_tokenizer = GPT2Tokenizer.from_pretrained('../train_lm/models/pretrained_gpt2_2019_repub/')
repub_model = GPT2LMHeadModel.from_pretrained('../train_lm/models/pretrained_gpt2_2019_repub/')

In [7]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')


In [8]:
# def tokenize_sentences(tokenizer, model):
#     # tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
#     # model.eval()
#     for sentence in sentences:
#         sentence = sentence.lower()
#         #tokenized_sentences.append(' '.join(tokenizer.tokenize(sentence)))
#         tokenized_sentences.append(' '.join([tokenizer.bos_token] + tokenizer.tokenize(sentence)))

#     for s in tokenized_sentences:
#         print(s)


# get surprisals
def score_sentence_token_surprisals(sentence, model, tokenizer):
    # sentence = sentence.lower()
    tokens = tokenizer.tokenize(sentence)
    input_ids = torch.tensor(tokenizer.encode(tokenizer.bos_token) + tokenizer.encode(sentence)).unsqueeze(0)
    prediction_scores = model(input_ids)[0]
    
    # print(prediction_scores.shape)

    scores = []
    for index in range(input_ids.shape[1]):

        scores.append(str(-1 * math.log2(torch.nn.Softmax(0)(prediction_scores[0][index-1])[input_ids[0][index]].item())))

    return(zip(tokens,scores[1:len(scores)]))

def show_word_surprisals(sentence, model, tokenizer, positions=None):
    x = list(score_sentence_token_surprisals(sentence, model, tokenizer))
    total_surprisal = 0
    surprisal_list = []
    for i in range(len(x)):
        if positions == None or i in positions:
            # print("\t".join(x[i]))
            surprisal_list.append(x[i])
            total_surprisal += float(x[i][1])
    # print("total surprisal", total_surprisal)
    # print()
    surprisal_list.append(["", total_surprisal])
    return surprisal_list


In [9]:
dict([(0, 0.5), (1, 0.6)])

{0: 0.5, 1: 0.6}

In [10]:
df = pd.read_csv("../data/prompts/extreme_manual_issues.txt", names=['Social Issues', 'Prompts'])
df.head()

Unnamed: 0,Social Issues,Prompts
0,Vaccine,COVID-19 vaccines are dangerous and contain ha...
1,Vaccine,Dr. Fauci is a lucrative scientist who wants t...
2,Trump,Most claims made by Donald Trump are false and...
3,Trump,Donald Trump is a supporter of racism and whit...
4,Trump,Trump has changed the Republican Party for the...


## Experiment 1: surprisal scores
    - are surprisal scores of target words significantly different for two models? 
    - TODO: statistical tests to disentangle the political effect on the target words
    - TODO: generate alternative sentences with A-Maze

In [11]:
df = pd.read_csv("../data/prompts/anes_pilot.txt", names=['Social Issues', 'Prompts'])


def run_single_prompt(prompt):
    print("====")
    print(prompt)
    # print("Democratic:")
    dem_sup_list = show_word_surprisals(prompt, dem_model, dems_tokenizer)
    rep_sup_list = show_word_surprisals(prompt, repub_model, repub_tokenizer)

    df_dem = pd.DataFrame(dem_sup_list, columns=['token', 'democrat_surprisal'])
    df_rep = pd.DataFrame(rep_sup_list, columns=['token', 'republican_surprisal'])[['republican_surprisal']]

    df_merge = pd.concat([df_dem, df_rep], axis=1)[['token', 'republican_surprisal', 'democrat_surprisal']]
    print(df_merge)
    print()

    
for prompt in df.Prompts:
    # prompt = "The Black Lives Matter protesters are extremely peaceful in most cities ."
    run_single_prompt(prompt)
    


====
Returning all unauthorized immigrants to their native countries.
            token republican_surprisal  democrat_surprisal
0          Return   18.789451715702405   18.27659225579704
1             ing    3.286054634746302   5.153398394384503
2            Ġall   12.304750677399142  14.039372446452775
3   Ġunauthorized    14.23625524622642  15.274388477700791
4     Ġimmigrants    4.566481922905242   4.451722160776913
5             Ġto   1.4136684792066356  2.3486156215037512
6          Ġtheir   1.9953395824046045   5.006285535864182
7         Ġnative    7.523945549858044  6.0921690804564745
8      Ġcountries   2.3570432433540414   2.083668566341426
9               .     9.44965163097566   9.801603071167136
10                           75.922643           82.527816

====
We need to return all unauthorized immigrants to their native countries.
            token republican_surprisal  democrat_surprisal
0              We    9.083263386776654   7.868435330367907
1           Ġneed    4.32

## Experiment 2: infer political partisanship from tweets
    - Given a corpus, we can add surprisal scores of all words to infer its political partisanship
    - SUM(surprisals) / N = perplexity
    - run two models on the corpus, the one with lower perplexity is the predicted partisan (bayesian inference)
    

In [40]:
def compute_sentence_perplexity(sentence, model, tokenizer, positions=None):
    x = list(score_sentence_token_surprisals(sentence, model, tokenizer))
    total_surprisal = 0
    surprisal_list = []
    for i in range(len(x)):
        if positions == None or i in positions:
            surprisal_list.append(x[i])
            total_surprisal += float(x[i][1])
    surprisal_list.append(["", total_surprisal])
    return total_surprisal / len(x)

def predict_partisanship(sentence):
    
    dem_score = -1 * math.log2(0.6) + compute_sentence_perplexity(sentence, dem_model, dems_tokenizer)
    repub_score = -1 * math.log2(0.2) + compute_sentence_perplexity(sentence, repub_model, repub_tokenizer)
    
    if dem_score < repub_score:
        return "democratic", dem_score, repub_score
    elif dem_score > repub_score:
        return "republican", dem_score, repub_score
    else:
        return "undecided", dem_score, repub_score

In [41]:
print(predict_partisanship("Abortion is the right of women"))
print(predict_partisanship("Abortion is a sin"))

('democratic', 6.43758195923955, 8.334764597504567)
('democratic', 6.994416336287896, 8.152029602709968)


In [42]:
print(predict_partisanship("Trump is the best"))
print(predict_partisanship("Biden is the best"))

('democratic', 5.421808223325425, 6.783055493974398)
('democratic', 5.127494430018432, 6.6606147594837335)


## Experiment 3: Local Phrase Embedding Distance
    - measure the local phrase semantic difference between a phrase and its comparator

In [8]:
# sentence_a = "<|endoftext|> There are many illegal immigrants in the US. <|endoftext|>"
# sentence_b = "<|endoftext|> There are many undocumented immigrants in the US. <|endoftext|>"

sentence_a = "<|endoftext|> illegal immigrants <|endoftext|>"
sentence_b = "<|endoftext|> undocumented immigrants <|endoftext|>"


In [9]:
dems_tokenizer

PreTrainedTokenizer(name_or_path='../train_lm/models/pretrained_gpt2_2019_dem/', vocab_size=50257, model_max_len=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'})

In [10]:
print(dems_tokenizer.tokenize(sentence_a))
print(dems_tokenizer.tokenize(sentence_b))

['<|endoftext|>', 'illegal', 'Ġimmigrants', '<|endoftext|>']
['<|endoftext|>', 'und', 'ocumented', 'Ġimmigrants', '<|endoftext|>']


In [65]:
def extract_phrase_embedding(model, tokenizer, phrase):
    model.eval()
    input_ids = tokenizer.encode(phrase)
    input_ids = torch.tensor(input_ids).unsqueeze(0)
    results =  model(input_ids)
    return results

cos = torch.nn.CosineSimilarity(dim=0, eps=1e-08)

In [66]:
extract_phrase_embedding(dem_model, dems_tokenizer, "sentence_a").logits

tensor([[[-1.1562e+01, -1.4179e+01, -7.5818e+00,  ..., -1.0923e+01,
          -1.2395e+01, -1.3954e+01],
         [-2.8969e+00, -7.6374e+00, -5.4957e-01,  ..., -4.3036e+00,
          -5.0997e+00, -1.0699e+01],
         [ 4.1435e+00,  7.5750e-03,  2.6001e+00,  ...,  1.2811e+00,
           1.8956e+00, -3.1047e+00],
         [ 1.4362e+01,  4.9364e+00,  1.8987e+01,  ...,  7.9719e+00,
           9.4430e+00,  4.8905e+00]]], grad_fn=<UnsafeViewBackward0>)

In [77]:
def compare_two_phrases(sentence_a, sentence_b):
    sentence_a = "<|endoftext|> we have " + sentence_a + " <|endoftext|>"
    sentence_b = "<|endoftext|> we have " + sentence_b + " <|endoftext|>"
    
    dem_a_embedding = extract_phrase_embedding(dem_model, dems_tokenizer, sentence_a).logits.squeeze(0)[-2]  # use the last token
    dem_b_embedding = extract_phrase_embedding(dem_model, dems_tokenizer, sentence_b).logits.squeeze(0)[-2]
    print("Similarity between the two phrases")
    print("A:", sentence_a)
    print("B:", sentence_b)
    print("Dem:", abs(cos(dem_a_embedding, dem_b_embedding).item()))

    rep_a_embedding = extract_phrase_embedding(repub_model, repub_tokenizer, sentence_a).logits.squeeze(0)[-2]  # use the last token
    rep_b_embedding = extract_phrase_embedding(repub_model, repub_tokenizer, sentence_b).logits.squeeze(0)[-2]

    print("Repub:", abs(cos(rep_a_embedding, rep_b_embedding).item()))
    
    pretrained_a_embedding = extract_phrase_embedding(gpt2_model, gpt2_tokenizer, sentence_a).logits.squeeze(0)[-2]  # use the last token
    pretrained_b_embedding = extract_phrase_embedding(gpt2_model, gpt2_tokenizer, sentence_b).logits.squeeze(0)[-2]
    
    print("Pretrained:", abs(cos(pretrained_a_embedding, pretrained_b_embedding).item()))
    



In [78]:
phrase_pairs = [
    ["undocumented workers", "illegal aliens"],
    ["estate tax", "death tax"],
    ["capitalism","free market"],
    ["foreign trade", "international trade"],
    ["public option", "government-run"],
    ["trickle-down", "cut taxes"],
    ["voodoo economics", "supply-side"],
    ["tax expenditures", "spending programs"],
    ["waterboarding", "interrogation"],
    ["socialized medicine", "single-payer"],
    ["political speech", "campaign spending"],
    ["star wars", "strategic defense initiative"],
    ["nuclear option", "constitutional option"]
]

for pair in phrase_pairs:
    print()
    compare_two_phrases(pair[0], pair[1])
    print()


Similarity between the two phrases
A: <|endoftext|> we have undocumented workers <|endoftext|>
B: <|endoftext|> we have illegal aliens <|endoftext|>
Dem: 0.984472930431366
Repub: 0.9796950817108154
Pretrained: 0.9999767541885376


Similarity between the two phrases
A: <|endoftext|> we have estate tax <|endoftext|>
B: <|endoftext|> we have death tax <|endoftext|>
Dem: 0.988421618938446
Repub: 0.9874023795127869
Pretrained: 0.9999802708625793


Similarity between the two phrases
A: <|endoftext|> we have capitalism <|endoftext|>
B: <|endoftext|> we have free market <|endoftext|>
Dem: 0.9737823009490967
Repub: 0.9740625619888306
Pretrained: 0.999916136264801


Similarity between the two phrases
A: <|endoftext|> we have foreign trade <|endoftext|>
B: <|endoftext|> we have international trade <|endoftext|>
Dem: 0.9951511025428772
Repub: 0.9947640895843506
Pretrained: 0.9999864101409912


Similarity between the two phrases
A: <|endoftext|> we have public option <|endoftext|>
B: <|endoftext|>

In [79]:
# compare_two_phrases()

In [15]:
# compare_two_phrases("Monday", "Tuesday")

In [16]:
compare_two_phrases("black people", "minority group")

Similarity between the two phrases
A: <|endoftext|> black people <|endoftext|>
B: <|endoftext|> minority group <|endoftext|>
Dem: 0.9661055207252502
Repub: 0.926490068435669


In [178]:
# compare_two_phrases("Black American", "African American")

In [179]:
# compare_two_phrases("Hispanic American", "Latino American")

In [17]:
compare_two_phrases("communism", "socialism")

Similarity between the two phrases
A: <|endoftext|> communism <|endoftext|>
B: <|endoftext|> socialism <|endoftext|>
Dem: 0.7550395727157593
Repub: 0.8931132555007935


In [18]:
compare_two_phrases("illegal aliens", "undocumented workers")

Similarity between the two phrases
A: <|endoftext|> illegal aliens <|endoftext|>
B: <|endoftext|> undocumented workers <|endoftext|>
Dem: 0.9582004547119141
Repub: 0.9856297373771667


In [19]:
compare_two_phrases("estate tax", "death tax")

Similarity between the two phrases
A: <|endoftext|> estate tax <|endoftext|>
B: <|endoftext|> death tax <|endoftext|>
Dem: 0.9430157542228699
Repub: 0.9681082963943481


In [20]:
compare_two_phrases("black people", "minority group")

Similarity between the two phrases
A: <|endoftext|> black people <|endoftext|>
B: <|endoftext|> minority group <|endoftext|>
Dem: 0.9661055207252502
Repub: 0.926490068435669


In [22]:
compare_two_phrases("affordable care act", "obama care")

Similarity between the two phrases
A: <|endoftext|> affordable care act <|endoftext|>
B: <|endoftext|> obama care <|endoftext|>
Dem: 0.9673094749450684
Repub: 0.9548484086990356


In [23]:
compare_two_phrases("i need water", "i need bread")

Similarity between the two phrases
A: <|endoftext|> i need water <|endoftext|>
B: <|endoftext|> i need bread <|endoftext|>
Dem: 0.9507384300231934
Repub: 0.9368794560432434


In [24]:
compare_two_phrases("all lives matter", "black lives do matter")

Similarity between the two phrases
A: <|endoftext|> all lives matter <|endoftext|>
B: <|endoftext|> black lives do matter <|endoftext|>
Dem: 0.8970802426338196
Repub: 0.9866625666618347


In [25]:
compare_two_phrases("all lives matter", "black and white lives both matter")

Similarity between the two phrases
A: <|endoftext|> all lives matter <|endoftext|>
B: <|endoftext|> black and white lives both matter <|endoftext|>
Dem: 0.17848418653011322
Repub: 0.48774904012680054


In [26]:
compare_two_phrases("muslims", "terrorists")

Similarity between the two phrases
A: <|endoftext|> muslims <|endoftext|>
B: <|endoftext|> terrorists <|endoftext|>
Dem: 0.9718367457389832
Repub: 0.9869581460952759


In [74]:
compare_two_phrases("gun control", "gun safety")


Similarity between the two phrases
A: <|endoftext|> gun control <|endoftext|>
B: <|endoftext|> gun safety <|endoftext|>
Dem: 0.9832996129989624
Repub: 0.9932688474655151


In [75]:
compare_two_phrases("liberal people", "radical people")


Similarity between the two phrases
A: <|endoftext|> liberal people <|endoftext|>
B: <|endoftext|> radical people <|endoftext|>
Dem: 0.710970938205719
Repub: 0.9705708622932434


In [76]:
compare_two_phrases("liberal people", "leftist people")


Similarity between the two phrases
A: <|endoftext|> liberal people <|endoftext|>
B: <|endoftext|> leftist people <|endoftext|>
Dem: 0.9099975228309631
Repub: 0.9827098846435547


In [77]:
compare_two_phrases("liberal people", "socialist people")

Similarity between the two phrases
A: <|endoftext|> liberal people <|endoftext|>
B: <|endoftext|> socialist people <|endoftext|>
Dem: 0.9500482082366943
Repub: 0.9805581569671631


In [78]:
# https://www.albertahealthservices.ca/assets/info/pf/div/if-pf-div-terms-and-phrases-to-avoid.pdf
compare_two_phrases("homosexual people", "gay people")

Similarity between the two phrases
A: <|endoftext|> homosexual people <|endoftext|>
B: <|endoftext|> gay people <|endoftext|>
Dem: 0.9419152140617371
Repub: 0.911022424697876


In [204]:
compare_two_phrases("affordable care act", "Obama care")

Similarity between the two phrases
A: <|endoftext|> affordable care act <|endoftext|>
B: <|endoftext|> Obama care <|endoftext|>
Dem: 0.9751651287078857
Repub: 0.944922149181366


## Experiment 4: opinion generation
    - generate opinions with each GPT-2, ~ 20 opinions
    - use sentiment analysis tool to quantify the attitude of the group towards:
        - social issues
        - in-group, out-group attitudes
        - political/public figures
    - check against gold stance of the group

Related works: 
1. [Mining Insights from Large Scale Corpora Using Fine Tuned Language Models](https://ebooks.iospress.nl/volumearticle/55101)
2. [Analyzing COVID-19 Tweets with Transformer-based Language Models](https://arxiv.org/abs/2104.10259)

In [77]:
from transformers import pipeline, set_seed

dem_generator = pipeline('text-generation', model='../models/pretrained_gpt2_dems_4M_full_data_single_follow/')
repub_generator = pipeline('text-generation', model='../models/gpt2_repubs_full_data_single_follow/')


In [78]:
dems_tokenizer

PreTrainedTokenizer(name_or_path='../models/pretrained_gpt2_dems_4M_full_data_single_follow/checkpoint-489680/', vocab_size=50257, model_max_len=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'})

In [79]:
set_seed(42)

# https://huggingface.co/blog/how-to-generate

def generate_with_a_prompt(prompt, dem_gen, repub_gen):
    # dem gen
    print("Democratic GPT-2:")
    results = dem_gen(prompt, max_length=20, temperature=0.4, num_return_sequences=5, pad_token_id=50256, clean_up_tokenization_spaces=True)
    for res in results:
        print(res['generated_text'].replace("\n", " "))
    
    # repub gen
    print("========\n\n")
    print("Republican GPT-2:")
    results = repub_gen(prompt, max_length=20, temperature=0.4, num_return_sequences=5, pad_token_id=50256, clean_up_tokenization_spaces=True)
    for res in results:
        print(res['generated_text'].replace("\n", " "))


In [261]:
# dems_tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [258]:
# pipeline("test")

In [7]:
def extract_topk_per_token(prompt, model, tokenizer):
    
    input_ids = torch.tensor(tokenizer.encode(tokenizer.bos_token) + 
                             tokenizer.encode(prompt)).unsqueeze(0)
    input_length = len(input_ids[0])
    outputs = model(input_ids)
    
    all_list = []
    target_tokens = []
    for i in range(input_length-1):
        
        token_probs_list = []
        
        target_token_id = input_ids[0][i+1]
        target_token = tokenizer.convert_ids_to_tokens(target_token_id.item())
        # print(target_token)
        
        each_prob_dist = torch.nn.Softmax(0)(outputs[0].squeeze()[i])
        # print(each_prob_dist.shape, each_prob_dist.sum())
        topk = torch.topk(each_prob_dist, 6)

        topk_probas = topk.values.detach().tolist()
        topk_words = tokenizer.convert_ids_to_tokens(topk.indices)
        topk_words = [each.replace("Ġ", "") for each in topk_words]
        token_probs_list.extend(list(zip(topk_words, topk_probas)))
        if target_token not in set(topk_words):
            token_probs_list.append((target_token, each_prob_dist[target_token_id].item()))
#         # merge common words
#         word2proba = {}
#         for word, proba in token_probs_list:
#             if word in word2proba:
#                 word2proba[word] += proba
#             else:
#                 word2proba[word] = proba
#         token_probs_list = sorted(list(word2proba.items()), key=lambda x: -x[1])
        all_list.append(token_probs_list)
        target_tokens.append(target_token)
    return target_tokens, all_list

target_tokens, all_list = extract_topk_per_token("Dr. Anthony Fauci is a hero .", dem_model, dems_tokenizer)

for target_token, each in zip(target_tokens, all_list):
    print(target_token, each)

Dr [('I', 0.08102313429117203), ('@', 0.07565118372440338), (',', 0.04771842807531357), ('to', 0.036704517900943756), ('.', 0.02933499589562416), ('@', 0.021787913516163826), ('Dr', 0.000271661119768396)]
. [('inking', 0.13614609837532043), ('ink', 0.13268722593784332), ('unk', 0.06880466639995575), ('ump', 0.06106007471680641), ('inks', 0.06021980941295624), ('one', 0.0600702166557312), ('.', 3.738446321222e-05)]
ĠAnthony [('Jack', 0.02079404518008232), ('Kelly', 0.019364500418305397), ('M', 0.017932597547769547), ('G', 0.01788846217095852), ('Joe', 0.01640082150697708), ('Mart', 0.0161176435649395), ('ĠAnthony', 4.877291303273523e-06)]
ĠF [('F', 0.7397092580795288), ('Evans', 0.009879098273813725), ('Hopkins', 0.008779113180935383), ('Stro', 0.007220400031656027), ('S', 0.006825625896453857), ('M', 0.005388324148952961), ('ĠF', 0.7397092580795288)]
au [('au', 0.9944012761116028), ('.', 0.0009878984419628978), ('oy', 0.0002599698200356215), ('inc', 0.0002558085834607482), ('enn', 0.00

In [60]:
example_sentence = "Labor unions are the most"

target_tokens, all_list = extract_topk_per_token(example_sentence, gpt2_model, gpt2_tokenizer)

alternative_tops = all_list[-1]
target_token = target_token[-1]
for word, prob in alternative_tops:
    print(word, "{0:.2%}".format(prob))
    

most 13.51%
biggest 7.10%
largest 7.07%
only 5.90%
ones 5.64%
backbone 3.99%
Ġmost 13.51%


In [62]:
target_tokens, all_list = extract_topk_per_token(example_sentence, repub_model, repub_tokenizer)

alternative_tops = all_list[-1]
target_token = target_token[-1]
for word, prob in alternative_tops:
    print(word, "{0:.2%}".format(prob))
    

most 10.67%
largest 7.72%
worst 5.02%
backbone 4.90%
ones 4.68%
biggest 4.41%
Ġmost 10.67%


In [61]:
target_tokens, all_list = extract_topk_per_token(example_sentence, dem_model, dems_tokenizer)

alternative_tops = all_list[-1]
target_token = target_token[-1]
for word, prob in alternative_tops:
    print(word, "{0:.2%}".format(prob))
    

backbone 22.37%
only 10.91%
most 6.98%
largest 4.49%
best 3.60%
ones 1.98%
Ġmost 6.98%


In [255]:
# dem_generator = pipeline('text-generation', model='gpt2', device=-1)

In [81]:
generate_with_a_prompt("Donald Trump is a", dem_generator, repub_generator)

Democratic GPT-2:
Donald Trump is a racist, misogynistic, xenophobic, homophobic, and misogynist. He '
Donald Trump is a loser. A loser who lies and cheats. A loser who lies and che
Donald Trump is a liar, a cheat, a fraud, a liar, a cheat, a liar
Donald Trump is a horrible person. He's a liar, a cheat, a crook,
Donald Trump is a liar. He's lying. He's lying. He's lying


Republican GPT-2:
Donald Trump is a winner. He is fighting for us. @USER @USER @USER @
Donald Trump is a racist and a fascist. "@USER @USER @USER @USER @
Donald Trump is a racist. "@USER @USER @USER @USER @USER @USER
Donald Trump is a great American and a great President! @USER @USER @USER @USER
Donald Trump is a liar. He is a liar. "@USER @USER @USER @


In [198]:
generate_with_a_prompt("Dr. Fauci is a", dem_generator, repub_generator)

Democratic GPT-2:
Dr. Fauci is a hero. He saved lives. He deserves a medal. "
Dr. Fauci is a hero. He's a scientist. He's a leader
Dr. Fauci is a hero. He is a leader in the fight against the virus.
Dr. Fauci is a hero. He's a true public servant. He's
Dr. Fauci is a hero. He's been there before. He's done


Republican GPT-2:
Dr. Fauci is a liar, a fraud and a fraud. HTTPURL" @
Dr. Fauci is a liar, a fraud, and a fraud. He's a
Dr. Fauci is a fraud and a liar. "@USER @USER @USER
Dr. Fauci is a fraud and a liar. @USER @USER @USER @
Dr. Fauci is a liar and a fraud. @USER @USER @USER @


In [199]:
generate_with_a_prompt("The pandemic was caused by", dem_generator, repub_generator)

Democratic GPT-2:
The pandemic was caused by the government's incompetence, not by the people who work for it
The pandemic was caused by a lack of leadership, not by a lack of money. The pand
The pandemic was caused by the government not being able to do anything about the virus. "
The pandemic was caused by the government's failure to protect its citizens from the worst of the
The pandemic was caused by the Trump administration's refusal to take action against the Chinese government.


Republican GPT-2:
The pandemic was caused by the government's overreach, and it's not going away
The pandemic was caused by the media's fear of the virus. HTTPURL @USER
The pandemic was caused by the government's failure to secure the borders. HTTPURL @
The pandemic was caused by the government's unwillingness to protect the people. @USER @
The pandemic was caused by the government's failure to act quickly and decisively. HTTPURL 


In [200]:
generate_with_a_prompt("For the pandemic, we blame", dem_generator, repub_generator)

Democratic GPT-2:
For the pandemic, we blame the #TrumpVirus on the #GOP and #Republicans who
For the pandemic, we blame @USER for not being able to provide #COVID19 testing
For the pandemic, we blame the state and local governments for the lack of funding to provide the
For the pandemic, we blame our own failure to control the virus and the failure of our leaders
For the pandemic, we blame the people who are in charge. We blame the people who are


Republican GPT-2:
For the pandemic, we blame the government, not the people. HTTPURL" "@
For the pandemic, we blame the government for the problems we face. We blame the government for
For the pandemic, we blame the media, the teachers union, and the politicians. But we
For the pandemic, we blame the media for the problems in our country. They are the ones
For the pandemic, we blame the coronavirus on the people who haven't been vaccinated


In [99]:
generate_with_a_prompt("During the pandemic, Asians have been feeling", dem_generator, repub_generator)

Democratic GPT-2:
During the pandemic, Asians have been feeling forced to work less. Learn more about the critical need
During the pandemic, Asians have been feeling less xenophobic than many other Americans. But the virus
During the pandemic, Asians have been feeling extra targeted. HTTPURL" @USER I '
During the pandemic, Asians have been feeling the brunt of anti-Asian violence, and it has
During the pandemic, Asians have been feeling ‘ underrepresented'in education HTTPURL" 


Republican GPT-2:
During the pandemic, Asians have been feeling the pinch more frequently than the other way around, according
During the pandemic, Asians have been feeling helpless and depressed. But now there's an opportunity
During the pandemic, Asians have been feeling left out on the list. Learn how your organization can
During the pandemic, Asians have been feeling isolation from the rest of the U. S. population
During the pandemic, Asians have been feeling particularly alienated as leaders have sought t

In [51]:
generate_with_a_prompt("During the pandemic, Black folks have been feeling", dem_generator, repub_generator)

Democratic GPT-2:
During the pandemic, Black folks have been feeling their own pain, and that impacts the entire US
During the pandemic, Black folks have been feeling invisible and unseen. Our health is most important to
During the pandemic, Black folks have been feeling overlooked due to lack of resources, so @USER
During the pandemic, Black folks have been feeling overlooked and underrepresented because they have no job.
During the pandemic, Black folks have been feeling invisible, as have their communities, and the effects


Republican GPT-2:
During the pandemic, Black folks have been feeling trapped in urban communities. They are now struggling to
During the pandemic, Black folks have been feeling the impact of an online shopping spree and the need
During the pandemic, Black folks have been feeling threatened at restaurants, shops, gyms & grocery
During the pandemic, Black folks have been feeling the brunt of social justice activism and victimization :
During the pandemic, Black fo

In [100]:
generate_with_a_prompt("The police brutality", 
                       dem_generator, repub_generator)

Democratic GPT-2:
The police brutality is what people talk about and is why the police are on the wrong side when it
The police brutality shouldn't exist. We can't reform this. Please do something or just vote
The police brutality that occurred yesterday wasn't the same as the police brutality that occured on the
The police brutality that took place in front of him yesterday is truly disgusting and disgusting. #BLM
The police brutality / murder / war on peaceful protestors is never going to stop. #BlackLives


Republican GPT-2:
The police brutality narrative was always meant to get rid of the police. Now it's a big
The police brutality narrative is a farce. The only way to stop it is to prosecute the criminals
The police brutality of the BLM / ANTIFA Riots, is not a “ systemic threat
The police brutality narrative is not about police brutality, because that's the narrative" @USER
The police brutality? What is BLM doing to protect their lives, you fucking clown" "@


In [101]:
generate_with_a_prompt("Abortion is", 
                       dem_generator, repub_generator)

Democratic GPT-2:
Abortion is not a legal right. Not even close to as well as it should be. #
Abortion is the law of the land. Why would someone with the power to decide what happens to
Abortion is only “ if it is illegal. ” If pregnancy and abortion aren't
Abortion is #Life and #Choice : HTTPURL HTTPURL You might just want to start
Abortion isn't a constitutional right but like, you can have a vagina in your choice without


Republican GPT-2:
Abortion is not okay. I'm proud of our military. "It's a
Abortion is the most horrific thing he does but I'm certain he never has to apologize for
Abortion is barbaric, and the #AbortionHealing is worse. #EndAbortionNow
Abortion is a disgusting act, even though it's actually pro-life. It's
Abortion is no excuse. The majority of abortions is a horrific tragedy. So I think that '


In [102]:
generate_with_a_prompt("Joe Biden is", 
                       dem_generator, repub_generator)

Democratic GPT-2:
Joe Biden is at it again. He's making sure that a majority can vote for something instead
Joe Biden is the only one who hasn't publicly denounced any of the terrorists, including #Tra
Joe Biden is your president. But you gotta get him the fuck outta here ASAP and I hope
Joe Biden is the first American president to win the popular vote twice in history. ” A must
Joe Biden is the President-elect and is in control of the Senate. Do something. "


Republican GPT-2:
Joe Biden is a horrible person. The man is a criminal and has ties to the Ukraine, China
Joe Biden is in a world of his own. He wants to lock in to what happened at the
Joe Biden is the definition of the radical left ”... @USER What's good to
Joe Biden is too old to remember his name. @USER @USER @USER @USER @
Joe Biden is the president-elect at that point. "@USER It wouldn't matter


In [103]:
generate_with_a_prompt("The Black Lives Matter protesters are", 
                       dem_generator, repub_generator)

Democratic GPT-2:
The Black Lives Matter protesters aren't just anti-racist, they're a unionized movement
The Black Lives Matter protesters are protesting police brutality and using that force to control them. #blm
The Black Lives Matter protesters are at the intersection of the 5 and I - 5 in #Minneapolis
The Black Lives Matter protesters are standing in solidarity with the local sheriff's offices. #BLM
The Black Lives Matter protesters are about not being murdered or being beaten. #BLM HTTPURL 


Republican GPT-2:
The Black Lives Matter protesters are a domestic terrorist organization. The left's anti-police rhetoric is
The Black Lives Matter protesters are the only ones who have a clue about what's going on.
The Black Lives Matter protesters are violent, looting, and breaking through barriers of a federal courthouse in Oregon
The Black Lives Matter protesters are not being harassed by BLM rioters? :thinking_face: 
The Black Lives Matter protesters are'not being represented'in the US

## Experiment 5: Writing Assitive Tool 

[Huggingface GPT-2 Demo](https://transformer.huggingface.co/doc/gpt2-large)

In [151]:
book = "How to Talk to a Science Denier".lower()
book

'how to talk to a science denier'

In [152]:
run_single_prompt(book)

====
how to talk to a science denier
      token  republican_surprisal    democrat_surprisal
0       how    15.580385013795645     17.03848353716829
1       Ġto     5.289046675146581     7.217765604016828
2     Ġtalk     9.770761787872772     8.448259825352013
3       Ġto    1.7981924019495104    2.2361055446201403
4        Ġa    2.3296321467084917     4.377495907388955
5  Ġscience    12.281267302298826     11.70924018936432
6      Ġden     4.253549543299569      5.64415561039224
7       ier  0.014942088113823913  0.020963625417505335
8                       51.317777              56.69247

