In [1]:
import pickle
import transformers
import torch
import pandas as pd

# Choose the compute device once: CUDA when torch reports it as available,
# otherwise fall back to the CPU. The printed line records which one was picked.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('GPU available' if device.type == 'cuda' else 'Uh oh, GPU unavailable')

  from .autonotebook import tqdm as notebook_tqdm


GPU available


In [2]:
def assemble_data(
    first_path: str = "biden_first_half.pickle",
    last_path: str = "biden_last_half.pickle",
) -> pd.DataFrame:
    """Load the two pickled halves of the tweet dump into a single DataFrame.

    Each pickle is expected to hold an iterable of ``(datetime, tweet)``
    records. Records from ``first_path`` come first, followed by the records
    from ``last_path``, in their stored order.

    Args:
        first_path: Pickle file holding the first half of the records.
        last_path: Pickle file holding the second half of the records.

    Returns:
        A DataFrame with the columns ``datetime`` and ``tweet``.

    Raises:
        FileNotFoundError: If either pickle file does not exist.
    """
    records = []
    for path in (first_path, last_path):
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only point this at pickles produced by a trusted source.
        # The context manager closes the file handle as soon as loading ends.
        with open(path, "rb") as handle:
            records.extend(pickle.load(handle))
    return pd.DataFrame(records, columns=["datetime", "tweet"])

# Build the combined tweet DataFrame once; later cells filter it by datetime.
biden_df: pd.DataFrame = assemble_data()

In [23]:
# Bare expression as the cell's last line, so the notebook renders the frame.
biden_df

Unnamed: 0,datetime,tweet
0,2020-10-15 00:00:20,@IslandGirlPRV @BradBeauregardJ @MeidasTouch T...
1,2020-10-15 00:00:21,@chrislongview Watching and setting dvr. Lets ...
2,2020-10-15 00:00:22,#censorship #HunterBiden #Biden #BidenEmails #...
3,2020-10-15 00:00:23,"""IS THIS WRONG??!!"" Cory Booker's BRILLIANT Fi..."
4,2020-10-15 00:00:25,"In 2020, #NYPost is being #censorship #CENSORE..."
...,...,...
521116,2020-11-08 23:59:16,"Mr. #Biden, tear down that wall (with #Mexico)..."
521117,2020-11-08 23:59:32,NYT: #BeratAlbayraks departure may also signal...
521118,2020-11-08 23:59:33,@staceyabrams Thank you for all your support a...
521119,2020-11-08 23:59:34,@elnuevoherald LOS MEDIOS A LA FUERZA QUIEREN ...


In [24]:
# Period boundaries as date-only strings. Against datetime64 values pandas
# parses them as the very start (00:00:00) of the given day.
post_debate_date = "2020-10-23"
post_election_date = "2020-11-04"

# Split the tweets into three half-open windows that together cover every row:
#   pre-debate    : datetime <  post_debate_date
#   post-debate   : post_debate_date <= datetime < post_election_date
#   post-election : datetime >= post_election_date
# The post-election bound is inclusive so that a tweet stamped exactly on the
# boundary is not excluded from all three subsets.
biden_pre_debate = biden_df[biden_df["datetime"] < post_debate_date]
biden_post_debate = biden_df[
    (biden_df["datetime"] >= post_debate_date)
    & (biden_df["datetime"] < post_election_date)
]
biden_post_election = biden_df[biden_df["datetime"] >= post_election_date]

In [25]:
# One checkpoint name feeds both loaders so tokenizer and model always match.
model_checkpoint = "bert-base-multilingual-cased"
# Load the tokenizer and the BERT model weights for that checkpoint.
tokenizer = transformers.BertTokenizer.from_pretrained(model_checkpoint)
# NOTE(review): the model is not moved to `device` in this cell, so unless a
# later cell calls `.to(device)` inference will not use the GPU — confirm.
model = transformers.BertModel.from_pretrained(model_checkpoint)



In [26]:
# Draw 100 pre-debate tweets for a quick embedding experiment. The fixed
# random_state makes the draw repeatable, so a Restart & Run All reproduces
# the same tweets, tokens and embeddings in the cells below.
subset = biden_pre_debate["tweet"].sample(100, random_state=42)

In [28]:
# Show the sampled tweets as a plain Python list of strings.
subset.tolist()

["#Biden corruption kickbacks scandal seems like small change for our Chinese Gandees. All political dynasties including regional ones treat such kickbacks &amp; privilege of father's office as Standard Operating Procedure https://t.co/N0cBZssQ6m",
 'Tens of thousands of murderers at the Womens March literally yesterday and @JoeBiden doesnt condemn them. Only Trump. #Biden is such a liar and an embarrassment to honesty in #America. #BidenCrimeFamily #Ukraine #Russia #HunterBiden https://t.co/V9uuWm2HGs https://t.co/P5aizttcUz',
 "#Biden is not runn'n for president. Kammy is. If yoo think HE is...Look at this beautifool oceanfront property for sale! https://t.co/L3K0sbDotI",
 '@JesusOfTheWest All good man. #Biden is corrupt and showing signs of being senile. Trump at least made his money on the private side. Vote for who you like. Ill vote for who I know is the better candidate. No response needed.',
 "@bennyjohnson Did you hear about VP J. #Biden giving information to the CCP then afte

In [29]:
# Encode the sampled tweets as one batch of PyTorch tensors. Padding brings
# every row to a common length and truncation is enabled for over-long tweets.
tokens = tokenizer(
    subset.tolist(),
    return_tensors="pt",
    truncation=True,
    padding=True,
)
# Last expression of the cell: display the encoded batch.
tokens

{'input_ids': tensor([[  101,   108, 31156,  ...,     0,     0,     0],
        [  101, 16411, 10107,  ...,     0,     0,     0],
        [  101,   108, 31156,  ...,     0,     0,     0],
        ...,
        [  101,   108, 13062,  ...,     0,     0,     0],
        [  101, 45896, 10105,  ...,     0,     0,     0],
        [  101,   108, 30776,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [31]:
# Forward pass used purely for feature extraction. Running it under no_grad
# stops autograd from recording the graph, so intermediate activations are not
# kept alive and the batch needs far less memory.
with torch.no_grad():
    outputs = model(**tokens)
# Last expression of the cell: display the model output.
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.0218, -0.1480, -0.1161,  ...,  0.1724,  0.0806,  0.0090],
         [-0.6128, -0.2928, -0.6081,  ...,  0.4386,  0.2196, -0.0694],
         [-0.1392, -0.2283, -0.4664,  ..., -0.1014, -0.2945,  0.0442],
         ...,
         [ 0.2303, -0.0581,  0.4358,  ...,  0.2609, -0.2466, -0.0886],
         [-0.2509, -0.3381,  0.3361,  ..., -0.0451, -0.2577, -0.1130],
         [-0.1320,  0.0829,  0.6726,  ...,  0.3365, -0.0341, -0.0809]],

        [[-0.2471, -0.0867, -0.3470,  ...,  0.2191,  0.2480, -0.1187],
         [ 0.0937,  0.1482,  0.4173,  ..., -0.1029,  0.5192,  0.0935],
         [ 0.6530, -0.1282,  0.4261,  ...,  0.1304,  0.1014,  0.2731],
         ...,
         [-0.1069, -0.4241,  0.0896,  ...,  0.6869,  0.3223, -0.6714],
         [ 0.1022, -0.3057,  0.3623,  ...,  0.3992,  0.2913, -0.3916],
         [ 0.2204, -0.4331,  0.1887,  ...,  0.4529,  0.3797, -0.6594]],

        [[-0.2304, -0.2367, -0.3354,  ...,  0.1332,  

In [34]:
# Keep only the hidden state at sequence position 0 for every tweet (the token
# with id 101 in the encoded batch, i.e. BERT's [CLS]); the result has one
# vector per tweet.
embeddings = outputs.last_hidden_state[:, 0]