In [1]:
import pickle
import transformers
import torch
import pandas as pd
from alibi_detect.cd import LSDDDrift, MMDDrift

if torch.cuda.is_available():
    device = torch.device('cuda')
    print('GPU available')
else:
    device = torch.device('cpu')
    print('Uh oh, GPU unavailable')

  from .autonotebook import tqdm as notebook_tqdm


GPU available


In [2]:
def assemble_data() -> pd.DataFrame:
    biden_first: list[tuple[any, str]] = pickle.load(open("biden_first_half.pickle", 'rb'))
    biden_last = pickle.load(open("biden_last_half.pickle", 'rb'))
    biden_first.extend(biden_last)
    return pd.DataFrame(biden_first, columns=["datetime", "tweet"])

biden_df = assemble_data()

In [3]:
biden_df

Unnamed: 0,datetime,tweet
0,2020-10-15 00:00:20,@IslandGirlPRV @BradBeauregardJ @MeidasTouch T...
1,2020-10-15 00:00:21,@chrislongview Watching and setting dvr. Lets ...
2,2020-10-15 00:00:22,#censorship #HunterBiden #Biden #BidenEmails #...
3,2020-10-15 00:00:23,"""IS THIS WRONG??!!"" Cory Booker's BRILLIANT Fi..."
4,2020-10-15 00:00:25,"In 2020, #NYPost is being #censorship #CENSORE..."
...,...,...
521116,2020-11-08 23:59:16,"Mr. #Biden, tear down that wall (with #Mexico)..."
521117,2020-11-08 23:59:32,NYT: #BeratAlbayraks departure may also signal...
521118,2020-11-08 23:59:33,@staceyabrams Thank you for all your support a...
521119,2020-11-08 23:59:34,@elnuevoherald LOS MEDIOS A LA FUERZA QUIEREN ...


In [4]:
post_debate_date = "2020-10-23"
post_election_date = "2020-11-04"
biden_pre_debate = biden_df[biden_df["datetime"] < post_debate_date]
biden_post_debate = biden_df[(biden_df["datetime"] >= post_debate_date) & (biden_df["datetime"] < post_election_date)]
biden_post_election = biden_df[biden_df["datetime"] >= post_election_date]

In [5]:
model_checkpoint = "bert-base-multilingual-cased"
tokenizer = transformers.BertTokenizer.from_pretrained(model_checkpoint)
model = transformers.BertModel.from_pretrained(model_checkpoint)



In [10]:
n_epochs = 1
n = 1000

model.train()
model.to(device)
with torch.no_grad():
    for i in range(n_epochs):
        pre_debate_null = tokenizer(biden_pre_debate["tweet"].sample(n).tolist(), padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False).to(device)
        pre_debate_null_embeddings = model(**pre_debate_null).pooler_output.cpu().numpy()
        del pre_debate_null

        pre_debate = tokenizer(biden_pre_debate["tweet"].sample(n).tolist(), padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False).to(device)
        pre_debate_embeddings = model(**pre_debate).pooler_output.cpu().numpy()
        del pre_debate

        post_debate = tokenizer(biden_post_debate["tweet"].sample(n).tolist(), padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False).to(device)
        post_debate_embeddings = model(**post_debate).pooler_output.cpu().numpy()
        del post_debate

        post_election = tokenizer(biden_post_election["tweet"].sample(n).tolist(), padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False).to(device)
        post_election_embeddings = model(**post_election).pooler_output.cpu().numpy()
        del post_election


        mmddrift = MMDDrift(x_ref=pre_debate_null_embeddings, backend="pytorch", p_val=.05)
        lsdddrift = LSDDDrift(x_ref=pre_debate_null_embeddings, backend="pytorch", p_val=.05)

        print(mmddrift.predict(pre_debate_null_embeddings))
        print(mmddrift.predict(pre_debate_embeddings))
        print(mmddrift.predict(post_debate_embeddings))
        print(mmddrift.predict(post_election_embeddings))

        print(lsdddrift.predict(pre_debate_null_embeddings))
        print(lsdddrift.predict(pre_debate_embeddings))
        print(lsdddrift.predict(post_debate_embeddings))
        print(lsdddrift.predict(post_election_embeddings))

        del pre_debate_null_embeddings, pre_debate_embeddings, post_debate_embeddings, post_election_embeddings
        torch.cuda.empty_cache()



    

{'data': {'is_drift': 0, 'distance': -0.001323401927947998, 'p_val': 1.0, 'threshold': 0.05, 'distance_threshold': array(0.00080585, dtype=float32)}, 'meta': {'name': 'MMDDriftTorch', 'online': False, 'data_type': None, 'version': '0.12.0', 'detector_type': 'drift', 'backend': 'pytorch'}}
{'data': {'is_drift': 0, 'distance': -0.0004379153251647949, 'p_val': 0.8899999856948853, 'threshold': 0.05, 'distance_threshold': array(0.00095004, dtype=float32)}, 'meta': {'name': 'MMDDriftTorch', 'online': False, 'data_type': None, 'version': '0.12.0', 'detector_type': 'drift', 'backend': 'pytorch'}}
{'data': {'is_drift': 1, 'distance': 0.0008519887924194336, 'p_val': 0.03999999910593033, 'threshold': 0.05, 'distance_threshold': array(0.00075525, dtype=float32)}, 'meta': {'name': 'MMDDriftTorch', 'online': False, 'data_type': None, 'version': '0.12.0', 'detector_type': 'drift', 'backend': 'pytorch'}}
{'data': {'is_drift': 1, 'distance': 0.002372264862060547, 'p_val': 0.0, 'threshold': 0.05, 'dista