In [1]:
import pickle
import transformers
import torch
import pandas as pd
from alibi_detect.cd import LSDDDrift, MMDDrift

# Pick the compute device once; later cells move the model and the tokenized
# inputs onto it.
cuda_ok = torch.cuda.is_available()
device = torch.device('cuda' if cuda_ok else 'cpu')
print('GPU available' if cuda_ok else 'Uh oh, GPU unavailable')

  from .autonotebook import tqdm as notebook_tqdm


GPU available


In [2]:
def assemble_data(
    first_path: str = "biden_first_half.pickle",
    last_path: str = "biden_last_half.pickle",
) -> pd.DataFrame:
    """Load both pickled halves of the tweet dump and return them as one frame.

    Args:
        first_path: Pickle file holding the first half of the records.
        last_path: Pickle file holding the second half of the records.

    Returns:
        DataFrame with columns ``datetime`` and ``tweet``; rows from
        ``first_path`` come first, followed by rows from ``last_path``.

    Raises:
        FileNotFoundError: If either pickle file does not exist.
    """
    # SECURITY: pickle.load can execute arbitrary code; only use on trusted files.
    # Context managers make sure both file handles are closed, even on error.
    with open(first_path, "rb") as first_file:
        first_half = pickle.load(first_file)
    with open(last_path, "rb") as last_file:
        last_half = pickle.load(last_file)

    # Each record is a (datetime, tweet) pair, matching the column names below.
    records = [*first_half, *last_half]
    return pd.DataFrame(records, columns=["datetime", "tweet"])

# Build the combined tweet frame once; later cells slice it by date.
biden_df = assemble_data()

In [3]:
# Display the assembled frame (the notebook renders a truncated preview).
biden_df

Unnamed: 0,datetime,tweet
0,2020-10-15 00:00:20,@IslandGirlPRV @BradBeauregardJ @MeidasTouch T...
1,2020-10-15 00:00:21,@chrislongview Watching and setting dvr. Lets ...
2,2020-10-15 00:00:22,#censorship #HunterBiden #Biden #BidenEmails #...
3,2020-10-15 00:00:23,"""IS THIS WRONG??!!"" Cory Booker's BRILLIANT Fi..."
4,2020-10-15 00:00:25,"In 2020, #NYPost is being #censorship #CENSORE..."
...,...,...
521116,2020-11-08 23:59:16,"Mr. #Biden, tear down that wall (with #Mexico)..."
521117,2020-11-08 23:59:32,NYT: #BeratAlbayraks departure may also signal...
521118,2020-11-08 23:59:33,@staceyabrams Thank you for all your support a...
521119,2020-11-08 23:59:34,@elnuevoherald LOS MEDIOS A LA FUERZA QUIEREN ...


In [4]:
# Day boundaries (ISO date strings) that separate the three analysis windows.
post_debate_date = "2020-10-23"
post_election_date = "2020-11-04"

# One boolean mask per side of each boundary, built from the timestamp column.
tweet_times = biden_df["datetime"]
before_debate_cutoff = tweet_times < post_debate_date
from_debate_cutoff = tweet_times >= post_debate_date
before_election_cutoff = tweet_times < post_election_date
from_election_cutoff = tweet_times >= post_election_date

# Window 1: everything strictly before the debate cutoff.
biden_pre_debate = biden_df.loc[before_debate_cutoff]
# Window 2: from the debate cutoff up to (not including) the election cutoff.
biden_post_debate = biden_df.loc[from_debate_cutoff & before_election_cutoff]
# Window 3: the election cutoff onwards.
biden_post_election = biden_df.loc[from_election_cutoff]

In [5]:
# Single checkpoint name passed to both the tokenizer and the model loaders below.
model_checkpoint = "bert-base-multilingual-cased"
# Load the pretrained tokenizer and encoder for that checkpoint.
tokenizer = transformers.BertTokenizer.from_pretrained(model_checkpoint)
model = transformers.BertModel.from_pretrained(model_checkpoint)



In [27]:
# Number of tweets drawn for the sample.
n = 1000

# The model is used purely as a feature extractor here, so run it in eval mode:
# train mode keeps dropout active, which makes the embeddings of the same tweet
# differ from one forward pass to the next.
model.eval()
model.to(device)

# Tokenize a random sample of pre-debate tweets into one padded batch on `device`.
pre_debate_null = tokenizer(
    biden_pre_debate["tweet"].sample(n).tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
    return_token_type_ids=False,
).to(device)

In [28]:
# Inspect the token-id tensor produced by the tokenizer call.
pre_debate_null['input_ids']

tensor([[   101,    108,  40315,  ...,      0,      0,      0],
        [   101,    137,  14309,  ...,      0,      0,      0],
        [   101,  46361,  66058,  ...,      0,      0,      0],
        ...,
        [   101,    137, 103306,  ...,      0,      0,      0],
        [   101,    137,  10734,  ...,      0,      0,      0],
        [   101,    137,  13486,  ...,      0,      0,      0]],
       device='cuda:0')

In [39]:
# Inference only: no_grad avoids building the autograd graph for this forward pass.
with torch.no_grad():
    # Call the module itself rather than .forward() so registered hooks are run.
    # The full output object is kept because the following cells read both
    # `.pooler_output` and `.last_hidden_state` from it.
    pre_debate_null_embeddings = model(**pre_debate_null, output_hidden_states=False)

In [31]:
# Shape check, then display the `pooler_output` tensor of the model output.
print(pre_debate_null_embeddings.pooler_output.shape)
pre_debate_null_embeddings.pooler_output

torch.Size([1000, 768])


tensor([[ 0.3055, -0.0568,  0.2168,  ..., -0.2143, -0.0010,  0.1692],
        [ 0.1602, -0.1356,  0.3190,  ..., -0.2114,  0.1897,  0.1591],
        [ 0.1960, -0.1599,  0.2846,  ..., -0.0137,  0.2131,  0.1596],
        ...,
        [ 0.0953, -0.2176,  0.4431,  ..., -0.3339,  0.3082,  0.1643],
        [ 0.4526, -0.1542,  0.3967,  ..., -0.4728,  0.2763,  0.2856],
        [ 0.1688, -0.0469,  0.2878,  ..., -0.2344,  0.1260,  0.0599]],
       device='cuda:0')

In [32]:
# Take index 0 along the second axis of `last_hidden_state` (the first token
# position of every sequence — presumably the [CLS] token; TODO confirm),
# print its shape, then display it.
print(pre_debate_null_embeddings.last_hidden_state[:, 0, :].shape)
pre_debate_null_embeddings.last_hidden_state[:, 0, :]

torch.Size([1000, 768])


tensor([[ 0.0221, -0.0602, -0.3180,  ...,  0.2246,  0.2059, -0.3551],
        [ 0.0234, -0.1797, -0.0762,  ...,  0.0375,  0.1536, -0.1373],
        [-0.2330, -0.3120, -0.0973,  ...,  0.2119,  0.1197, -0.2244],
        ...,
        [-0.1456, -0.0919, -0.3073,  ...,  0.1412,  0.1634, -0.1895],
        [-0.2168,  0.1021, -0.1197,  ...,  0.5530,  0.1545, -0.0289],
        [ 0.0633, -0.4306, -0.0654,  ...,  0.2322, -0.1206, -0.0192]],
       device='cuda:0')

In [41]:
# Drop the name bound to the model output, then report the GPU memory that
# tensors currently occupy (torch.cuda.memory_allocated reports bytes).
# NOTE(review): tensors displayed by earlier cells may still be referenced by
# IPython's output cache (Out[...]), so this number may not drop — confirm.
del pre_debate_null_embeddings
print(torch.cuda.memory_allocated())

3536910336


In [6]:
n_epochs = 1
n = 1000
batch_size = 100  # tweets per forward pass; bounds peak GPU memory


def embed_tweets(tweets, n_samples, batch_size=100):
    """Sample tweets and return their position-0 hidden states as a NumPy array.

    Args:
        tweets: pandas Series of tweet texts to sample from.
        n_samples: Number of tweets drawn with ``Series.sample``.
        batch_size: Number of tweets tokenized and embedded per forward pass.

    Returns:
        Array with one row per sampled tweet, taken from position 0 of the
        model's ``last_hidden_state``.
    """
    texts = tweets.sample(n_samples).tolist()
    chunks = []
    # Inference only: skip autograd bookkeeping so activations are not retained.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            encoded = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                return_tensors="pt",
                return_token_type_ids=False,
            ).to(device)
            # Move every chunk to the CPU right away so only one batch of
            # activations lives on the GPU at a time.
            chunks.append(model(**encoded).last_hidden_state[:, 0, :].cpu())
    return torch.cat(chunks).numpy()


# Feature extraction should be deterministic, so run in eval mode (dropout off).
model.eval()
model.to(device)

drift_records = []
for epoch in range(n_epochs):
    # Reference ("null") sample: its own random draw from the pre-debate window.
    pre_debate_null_embeddings = embed_tweets(biden_pre_debate["tweet"], n, batch_size)

    # Test samples, one per time window, compared against the reference below.
    window_embeddings = {
        "pre_debate": embed_tweets(biden_pre_debate["tweet"], n, batch_size),
        "post_debate": embed_tweets(biden_post_debate["tweet"], n, batch_size),
        "post_election": embed_tweets(biden_post_election["tweet"], n, batch_size),
    }
    torch.cuda.empty_cache()

    detectors = {
        "MMD": MMDDrift(x_ref=pre_debate_null_embeddings, backend="pytorch", p_val=.05),
        "LSDD": LSDDDrift(x_ref=pre_debate_null_embeddings, backend="pytorch", p_val=.05),
    }

    # Keep every prediction instead of discarding it, so the outcome of the
    # experiment can be inspected after the loop.
    for detector_name, detector in detectors.items():
        for window_name, embeddings in window_embeddings.items():
            outcome = detector.predict(embeddings)["data"]
            drift_records.append({
                "epoch": epoch,
                "detector": detector_name,
                "window": window_name,
                "is_drift": outcome["is_drift"],
                "p_val": outcome["p_val"],
                "distance": outcome["distance"],
            })

    # Release the per-epoch arrays and detectors before the next iteration.
    del pre_debate_null_embeddings, window_embeddings, detectors
    torch.cuda.empty_cache()

drift_results = pd.DataFrame(drift_records)
drift_results



    

KeyboardInterrupt: 