In [1]:
import pandas as pd
import sys
import os
import warnings

# Suppress FutureWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Put the sibling ``src`` directory (one level above the working directory) on
# the import path so the project-local ``preprocessing`` module can be imported.
src_dir = os.path.abspath(os.path.join(os.pardir, "src"))
sys.path.append(src_dir)

from preprocessing import TweetFeatureExtractor, TweetProcessor

# Root directory of the datasets, relative to the working directory.
DATA_PATH = "../data"

In [9]:
# Read the Twibot-20 training split, then separate the target column ("label")
# from the remaining feature columns.
train_path = DATA_PATH + "/Twibot20/tweets/train.json"
train = pd.read_json(train_path)
X = train.drop(columns=["label"])
y = train["label"]

In [10]:
# Preview the first five rows of the raw training frame.
train.head(n=5)

Unnamed: 0,ID,profile,tweet,neighbor,domain,label
0,17461978,"{'id': '17461978 ', 'id_str': '17461978 ', 'na...",[RT @CarnivalCruise: 🎉 Are you ready to see wh...,,"[Politics, Business, Entertainment]",0
1,1297437077403885568,"{'id': '1297437077403885568 ', 'id_str': '1297...",,"{'following': ['170861207', '23970102', '47293...",[Politics],1
2,17685258,"{'id': '17685258 ', 'id_str': '17685258 ', 'na...",[RT @realDonaldTrump: THANK YOU #RNC2020! http...,"{'following': ['46464108', '21536398', '186434...","[Politics, Entertainment, Sports]",0
3,15750898,"{'id': '15750898 ', 'id_str': '15750898 ', 'na...",[A family fears they may have been cheated out...,"{'following': ['2324715174', '24030137', '2336...",[Politics],0
4,1659167666,"{'id': '1659167666 ', 'id_str': '1659167666 ',...",[RT @VonteThePlug: Yeah but he ain’t got one h...,"{'following': ['1628313708', '726405625', '130...",[Politics],1


In [11]:
from sklearn.pipeline import Pipeline

# Chain the two project-local preprocessing stages: feature extraction first,
# tweet text processing second.
preprocessing_steps = [
    ("feature_extractor", TweetFeatureExtractor()),
    ("tweet_processor", TweetProcessor()),
]
pipeline = Pipeline(steps=preprocessing_steps)

In [13]:
# Run the feature frame through both preprocessing stages.
# NOTE(review): the pipeline is never fitted before `transform` is called;
# presumably both steps are stateless — confirm against `preprocessing`.
X_transformed = pipeline.transform(X)

In [14]:
# Preview the first five rows, now including `tweets_joined` and the
# per-account average text statistics added by the pipeline.
X_transformed.head(n=5)

Unnamed: 0,ID,profile,tweet,neighbor,domain,tweets_joined,avg_word_count,avg_character_count,avg_hashtag_count,avg_mention_count,avg_link_count,avg_emoji_count,avg_positive_word_count,avg_negative_word_count
0,17461978,"{'id': '17461978 ', 'id_str': '17461978 ', 'na...",[RT @CarnivalCruise: 🎉 Are you ready to see wh...,,"[Politics, Business, Entertainment]",rt USER EMOJI ready see newest ship’s name wil...,24.565,166.275,0.76,1.855,1.035,0.265,0.935,0.24
1,1297437077403885568,"{'id': '1297437077403885568 ', 'id_str': '1297...",,"{'following': ['170861207', '23970102', '47293...",[Politics],,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,17685258,"{'id': '17685258 ', 'id_str': '17685258 ', 'na...",[RT @realDonaldTrump: THANK YOU #RNC2020! http...,"{'following': ['46464108', '21536398', '186434...","[Politics, Entertainment, Sports]",rt USER thank rnc2020 LINK sep great takeaways...,20.77,139.155,0.185,1.02,0.48,0.175,0.68,0.485
3,15750898,"{'id': '15750898 ', 'id_str': '15750898 ', 'na...",[A family fears they may have been cheated out...,"{'following': ['2324715174', '24030137', '2336...",[Politics],family fears may cheated 20000 donations raise...,24.53,166.195,0.305,0.5,0.865,0.075,0.51,0.475
4,1659167666,"{'id': '1659167666 ', 'id_str': '1659167666 ',...",[RT @VonteThePlug: Yeah but he ain’t got one h...,"{'following': ['1628313708', '726405625', '130...",[Politics],rt USER yeah ain’t got one happy song nigga al...,13.579545,85.761364,0.068182,0.625,0.670455,0.602273,0.318182,0.545455


In [23]:
# One joined-tweets string per row, as a plain Python list for the tokenizer.
tweets = X_transformed["tweets_joined"].to_list()

In [24]:
from transformers import BertTokenizer

# Load the tokenizer that belongs to the `bert-base-uncased` checkpoint.
# NOTE(review): the USER / EMOJI / LINK placeholders produced by the
# preprocessing step are not registered as special tokens, so the uncased
# tokenizer lowercases them and may split them into word pieces
# (e.g. EMOJI -> 'em', '##oj', '##i' in the demo output further down).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [25]:
# Show the first preprocessed document exactly as it will be fed to the tokenizer.
first_tweet = tweets[0]
print("Original: ", first_tweet)

Original:  rt USER EMOJI ready see newest ship’s name will EMOJI thanks partners helping us unbox name… sep time receipts USER receipt scanners make easy mess stress check LINK LINK sep steady wants encourage invest financial future connect bank account USER access benefits income insights online medical visits cash grants 1000 started today visiting LINK sponsored LINK sep good one USER let’s see yall better come everybody show best handshaq ad LINK LINK sep lsunationalchamps sep stand student athletes wewanttoplay sep wish luck america i’m back sharkweek starts tonight 8p et USER catch taking ultimate plunge shaqattack tomorrow 9p et LINK sep joining tennis champion USER USERs leadingthroughchange join us allstar episode today 10 pt LINK LINK sep let’s today im nominating USER LINK USER USER USER mystartingfive employees amp customers ready vote november take pledge today amp register —gt LINK LINK sep first shaqvsgronk 9 million live viewers helped USER raise money USER USER 125 mil

In [26]:
# Show how the first document is split into word-piece tokens.
first_tweet_tokens = tokenizer.tokenize(tweets[0])
print("Tokenized: ", first_tweet_tokens)

Tokenized:  ['rt', 'user', 'em', '##oj', '##i', 'ready', 'see', 'newest', 'ship', '’', 's', 'name', 'will', 'em', '##oj', '##i', 'thanks', 'partners', 'helping', 'us', 'un', '##box', 'name', '…', 'sep', 'time', 'receipts', 'user', 'receipt', 'scanner', '##s', 'make', 'easy', 'mess', 'stress', 'check', 'link', 'link', 'sep', 'steady', 'wants', 'encourage', 'invest', 'financial', 'future', 'connect', 'bank', 'account', 'user', 'access', 'benefits', 'income', 'insights', 'online', 'medical', 'visits', 'cash', 'grants', '1000', 'started', 'today', 'visiting', 'link', 'sponsored', 'link', 'sep', 'good', 'one', 'user', 'let', '’', 's', 'see', 'ya', '##ll', 'better', 'come', 'everybody', 'show', 'best', 'hands', '##ha', '##q', 'ad', 'link', 'link', 'sep', 'lsu', '##national', '##champ', '##s', 'sep', 'stand', 'student', 'athletes', 'we', '##wan', '##tto', '##play', 'sep', 'wish', 'luck', 'america', 'i', '’', 'm', 'back', 'shark', '##week', 'starts', 'tonight', '8', '##p', 'et', 'user', 'catch

In [27]:
# Show the vocabulary ids that correspond to the tokens of the first document.
first_tweet_tokens = tokenizer.tokenize(tweets[0])
first_tweet_ids = tokenizer.convert_tokens_to_ids(first_tweet_tokens)
print("Token IDs: ", first_tweet_ids)

Token IDs:  [19387, 5310, 7861, 29147, 2072, 3201, 2156, 14751, 2911, 1521, 1055, 2171, 2097, 7861, 29147, 2072, 4283, 5826, 5094, 2149, 4895, 8758, 2171, 1529, 19802, 2051, 28258, 5310, 24306, 26221, 2015, 2191, 3733, 6752, 6911, 4638, 4957, 4957, 19802, 6706, 4122, 8627, 15697, 3361, 2925, 7532, 2924, 4070, 5310, 3229, 6666, 3318, 20062, 3784, 2966, 7879, 5356, 8624, 6694, 2318, 2651, 5873, 4957, 6485, 4957, 19802, 2204, 2028, 5310, 2292, 1521, 1055, 2156, 8038, 3363, 2488, 2272, 7955, 2265, 2190, 2398, 3270, 4160, 4748, 4957, 4957, 19802, 21849, 25434, 25450, 2015, 19802, 3233, 3076, 7576, 2057, 7447, 9284, 13068, 19802, 4299, 6735, 2637, 1045, 1521, 1049, 2067, 11420, 28075, 4627, 3892, 1022, 2361, 3802, 5310, 4608, 2635, 7209, 25912, 21146, 19062, 5946, 3600, 4826, 1023, 2361, 3802, 4957, 19802, 5241, 5093, 3410, 5310, 5198, 2877, 2705, 22494, 5603, 22305, 2063, 3693, 2149, 2035, 14117, 2792, 2651, 2184, 13866, 4957, 4957, 19802, 2292, 1521, 1055, 2651, 10047, 2053, 27932, 5310, 4

In [28]:
# Length (in token ids, special tokens included) of the longest document.
# Nothing is truncated here, so the tokenizer may warn about sequences that
# exceed the model's 512-token limit; that is expected for this measurement.
encoded_lengths = (
    len(tokenizer.encode(tweet, add_special_tokens=True)) for tweet in tweets
)
max_len = max(encoded_lengths, default=0)
print("Max length: ", max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (4266 > 512). Running this sequence through the model will result in indexing errors


Max length:  27290


In [58]:
# Tokenize every document into fixed-length BERT inputs.
#
# Each document is wrapped in the special tokens, truncated to 512 tokens and
# padded up to 512 tokens, so every call yields tensors of shape (1, 512).
# The per-document tensors are collected in lists and stacked in later cells.
input_ids = []
attention_masks = []
for i, tweet in enumerate(tweets):
    # Lightweight progress report every 1000 documents.
    if i % 1000 == 0:
        print("iter: ", i)
    encoded_dict = tokenizer.encode_plus(
        tweet,
        add_special_tokens=True,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    input_ids.append(encoded_dict["input_ids"])
    # The attention mask marks real tokens versus padding.
    attention_masks.append(encoded_dict["attention_mask"])

iter:  0
iter:  1000
iter:  2000
iter:  3000
iter:  4000
iter:  5000
iter:  6000
iter:  7000
iter:  8000


In [59]:
import torch

# Stack the per-document (1, 512) tensors into one (num_documents, 512) tensor.
# The guard makes the cell idempotent: once `input_ids` is already a tensor,
# re-running the cell is a no-op instead of failing on a non-list input.
if not torch.is_tensor(input_ids):
    input_ids = torch.cat(input_ids, dim=0)

In [60]:
# Stack the per-document (1, 512) masks into one (num_documents, 512) tensor.
# The guard makes the cell idempotent: once `attention_masks` is already a
# tensor, re-running the cell is a no-op instead of failing on a non-list input.
if not torch.is_tensor(attention_masks):
    attention_masks = torch.cat(attention_masks, dim=0)

In [61]:
# Target column of the raw training frame (same column that was split off as `y`).
labels = train["label"]

In [62]:
# Convert the label Series into a tensor so it can be bundled with the token
# tensors in a TensorDataset.
labels = torch.tensor(labels)

In [63]:
from torch.utils.data import TensorDataset, random_split

In [64]:
# Bundle the three tensors so that indexing the dataset yields one aligned
# (input_ids, attention_mask, label) triple per document.
dataset = TensorDataset(input_ids, attention_masks, labels)

In [65]:
# 90 % of the samples go to training; whatever remains is held out for validation.
n_samples = len(dataset)
train_size = int(0.9 * n_samples)
val_size = n_samples - train_size

In [66]:
# Split with a dedicated, explicitly seeded generator so the train/validation
# partition is identical on every run. The notebook seeds the global RNGs only
# further down (after this cell), so without the generator the split would
# differ from one kernel session to the next.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

In [67]:
# Report how many samples ended up in each split.
train_report = "{:>5,} training samples".format(train_size)
val_report = "{:>5,} validation samples".format(val_size)
print(train_report)
print(val_report)

7,450 training samples
  828 validation samples


In [68]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    dataset=train_dataset, sampler=train_sampler, batch_size=batch_size
)

# Validation batches are read in their stored order; shuffling is unnecessary
# because no parameters are updated during evaluation.
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(
    dataset=val_dataset, sampler=val_sampler, batch_size=batch_size
)

In [69]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Pre-trained BERT encoder topped with a two-class classification head.
# The head's weights are not part of the checkpoint and are newly initialised
# (see the loader's message below), so the model must be fine-tuned before use.
classifier_options = {
    "num_labels": 2,
    "output_attentions": False,
    "output_hidden_states": False,
}
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", **classifier_options
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [72]:
# Run everything on the CPU; the training loop moves each batch to `device`.
# NOTE(review): consider selecting CUDA when it is available — confirm the
# memory budget first (batch_size=32 with 512-token sequences).
device = torch.device("cpu")
# Kept as the final expression so the notebook displays the model architecture.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [74]:
# Move the full tensors to the target device, rebinding each name to the moved
# tensor so all three stay consistent.
# `attention_mask` (singular) is kept as an alias because this cell previously
# bound the moved masks to that name only.
# NOTE(review): `dataset` was built from the tensors before this move and the
# training loop moves every batch itself, so this cell looks redundant — confirm.
input_ids = input_ids.to(device)
attention_masks = attention_masks.to(device)
attention_mask = attention_masks
labels = labels.to(device)

In [76]:
# List the model's named parameters, grouped into three illustrative sections:
# the embedding layer, the first transformer block and the output layers.
params = list(model.named_parameters())
print("The BERT model has {:} different named parameters.\n".format(len(params)))

parameter_sections = (
    ("==== Embedding Layer ====\n", params[0:5]),
    ("\n==== First Transformer ====\n", params[5:21]),
    ("\n==== Output Layer ====\n", params[-4:]),
)
for section_header, section_params in parameter_sections:
    print(section_header)
    for param_name, param_tensor in section_params:
        # Left-aligned name, right-aligned shape tuple.
        print("{:<55} {:>12}".format(param_name, str(tuple(param_tensor.size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [77]:
# Use PyTorch's own AdamW: the implementation exported by `transformers` is
# deprecated in favour of `torch.optim.AdamW`.
# weight_decay is set to 0.0 explicitly because torch defaults to 0.01 whereas
# the transformers optimizer defaulted to 0.0; this keeps the update rule
# equivalent to the previous configuration.
optimizer = torch.optim.AdamW(
    params=model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0
)

In [78]:
from transformers import get_linear_schedule_with_warmup

epochs = 4
# The scheduler is stepped once per batch in the training loop, so the length
# of the schedule is (batches per epoch) x (epochs). Using the number of
# samples instead would stretch the schedule by a factor of `batch_size` and
# leave the learning rate almost undecayed at the end of training.
total_steps = len(train_dataloader) * epochs
# Linear decay from the initial learning rate to zero, with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

In [79]:
import numpy as np


def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array-like of per-class scores, one row per sample.
        labels: NumPy array of true class indices; it is flattened before the
            comparison, so any shape with one entry per sample works.

    Returns:
        The share of correct predictions as a float in [0, 1].
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [80]:
import time
import datetime


def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: Duration in seconds (int or float); it is rounded to the
            nearest whole second before formatting.

    Returns:
        The string form of the corresponding ``datetime.timedelta``.
    """
    duration = datetime.timedelta(seconds=int(round(elapsed)))
    return str(duration)

In [81]:
import random

# Seed every random number generator used from here on (Python, NumPy, and
# PyTorch on CPU and on all CUDA devices) with the same value.
# NOTE(review): this runs after the train/validation split and after the model
# was loaded, so it does not influence those earlier steps.
seed_val = 42
seeders = (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
)
for apply_seed in seeders:
    apply_seed(seed_val)

In [82]:
# Fine-tune the model for `epochs` passes over the training set, run a full
# validation pass after each one, and collect per-epoch statistics in
# `training_stats`.
training_stats = []
total_t0 = time.time()
for epoch in range(0, epochs):
    # ------------------------------ Training ------------------------------
    print("")
    print("======== Epoch {:} / {:} ========".format(epoch + 1, epochs))
    print("Training...")

    t0 = time.time()
    total_train_loss = 0
    # Training mode: dropout layers are active.
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Progress report every 40 batches (the very first batch is skipped).
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print(
                "  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.".format(
                    step, len(train_dataloader), elapsed
                )
            )

        # `batch` holds three tensors: [0] input ids, [1] attention masks,
        # [2] labels. Move each of them to the target device.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear the gradients accumulated during the previous step.
        model.zero_grad()

        # Index the result instead of tuple-unpacking it: recent versions of
        # transformers return a dict-like output object, and unpacking that
        # yields its keys (strings) rather than the tensors. Indexing works
        # for both the output object and a plain tuple. Because labels are
        # supplied, element 0 is the loss.
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        loss = outputs[0]
        total_train_loss += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before applying the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ----------------------------- Validation -----------------------------
    print("")
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode: dropout layers behave differently than in training.
    model.eval()

    # Tracking variables. Correct predictions are counted per sample so that
    # a smaller final batch does not skew the reported accuracy.
    total_eval_correct = 0.0
    total_eval_samples = 0
    total_eval_loss = 0

    for batch in val_dataloader:
        # Same layout as the training batches: input ids, attention masks,
        # labels.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation, so skip building the
        # compute graph during the forward pass.
        with torch.no_grad():
            # With labels supplied, element 0 of the result is the loss and
            # element 1 holds the logits (the scores before any softmax).
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels,
            )
            loss, logits = outputs[0], outputs[1]

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to the CPU as NumPy arrays.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to("cpu").numpy()

        # Weight each batch's accuracy by its size; the totals are divided by
        # the number of validation samples below.
        total_eval_correct += flat_accuracy(logits, label_ids) * len(label_ids)
        total_eval_samples += len(label_ids)

    # Report the accuracy over all validation samples.
    avg_val_accuracy = total_eval_correct / total_eval_samples
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Average loss over the validation batches.
    avg_val_loss = total_eval_loss / len(val_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            "epoch": epoch + 1,
            "Training Loss": avg_train_loss,
            "Valid. Loss": avg_val_loss,
            "Valid. Accur.": avg_val_accuracy,
            "Training Time": training_time,
            "Validation Time": validation_time,
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))


Training...
