# Using Tweet Language for Twitter Bot Detection

Inspired by Wei and Nguyen's 2019 paper [Twitter Bot Detection Using Bidirectional Long Short-term Memory Neural Networks and Word Embeddings](https://drive.google.com/file/d/1rlasxgfruK8KmvaQB9ekTRJYeD3zOnL9/view?usp=share_link)

## Configuration

Authenticate with Google Cloud and list the Cloud Storage buckets that hold the dataset

In [1]:
# Authenticate this Colab session with Google Cloud so the gcloud/gsutil
# commands below can reach the project's storage buckets.
from google.colab import auth
auth.authenticate_user()
project_id = 'cse6242-381122'
# Point gcloud at the project, then list the buckets visible to this account.
!gcloud config set project {project_id}
!gsutil ls

Updated property [core/project].
gs://emp-1867/
gs://hmlaidsn/


In [2]:
# Copy the sample dataset, the label file, and the 25-d Twitter GloVe vectors
# from the bucket into the current working directory.
bucket_id = 'emp-1867'
!gsutil cp gs://{bucket_id}/TwiBot-20_sample.json gs://{bucket_id}/label.csv gs://{bucket_id}/glove.twitter.27B/glove.twitter.27B.25d.txt .

Copying gs://emp-1867/TwiBot-20_sample.json...
Copying gs://emp-1867/label.csv...
Copying gs://emp-1867/glove.twitter.27B/glove.twitter.27B.25d.txt...
==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

- [3 files][249.0 MiB/249.0 MiB]   18.8 MiB/s                                   
Operation completed over 3 objects/249.0 MiB.                                    


## Set-Up

### Import

In [60]:
import re
import string
from collections import OrderedDict

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe, vocab

In [5]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # A bare expression in the middle of a cell is never displayed, so print
    # the GPU name explicitly to make the selected device visible.
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")

### Data

Load the TwiBot-20 sample dataset published by [Feng](https://github.com/BunsenFeng/TwiBot-20) (JSON format) into a pandas DataFrame.

In [6]:
# One row per user; the displayed columns are ID, profile, tweet, neighbor, domain.
dataset = pd.read_json('./TwiBot-20_sample.json')
dataset

Unnamed: 0,ID,profile,tweet,neighbor,domain
0,17461978,"{'id': '17461978 ', 'id_str': '17461978 ', 'na...",[RT @CarnivalCruise: 🎉 Are you ready to see wh...,,"[Politics, Bussiness, Entertainment]"
1,1297437077403885568,"{'id': '1297437077403885568 ', 'id_str': '1297...",,"{'following': ['170861207', '23970102', '47293...",[Politics]
2,17685258,"{'id': '17685258 ', 'id_str': '17685258 ', 'na...",[RT @realDonaldTrump: THANK YOU #RNC2020! http...,"{'following': ['46464108', '21536398', '186434...","[Politics, Entertainment, Sports]"
3,15750898,"{'id': '15750898 ', 'id_str': '15750898 ', 'na...",[A family fears they may have been cheated out...,"{'following': ['2324715174', '24030137', '2336...",[Politics]
4,1659167666,"{'id': '1659167666 ', 'id_str': '1659167666 ',...",[RT @VonteThePlug: Yeah but he ain’t got one h...,"{'following': ['1628313708', '726405625', '130...",[Politics]
...,...,...,...,...,...
95,843514885644271616,"{'id': '843514885644271616 ', 'id_str': '84351...","[@doriandetails https://t.co/iiW9NlqTmZ\n, @us...","{'following': ['1263585754665504770', '1288883...",[Politics]
96,24578794,"{'id': '24578794 ', 'id_str': '24578794 ', 'na...",[The Vichy Republicans were embarrassing last ...,"{'following': ['2361631088', '60190930', '1451...","[Politics, Entertainment, Sports]"
97,39349894,"{'id': '39349894 ', 'id_str': '39349894 ', 'na...","[Without any basis, the NYAG has pledged to ta...","{'following': ['17454769', '17074440', '581999...","[Politics, Bussiness, Entertainment, Sports]"
98,48223726,"{'id': '48223726 ', 'id_str': '48223726 ', 'na...",[Take a moment to soak it all in with one of o...,"{'following': ['18462058', '14957318', '145879...",[Politics]


In [7]:
# Read the labels, keep only the digits of each raw id, and add a numeric
# 'ID' column that can be joined against the dataset's 'ID'.
labels_df = pd.read_csv('./label.csv')
labels_df = labels_df.assign(
    id=labels_df['id'].str.extract(r'([0-9]+)', expand=False)
)
labels_df = labels_df.assign(ID=pd.to_numeric(labels_df['id']))

labels_df.head()

Unnamed: 0,id,label,ID
0,17461978,human,17461978
1,1297437077403885568,bot,1297437077403885568
2,17685258,human,17685258
3,15750898,human,15750898
4,1659167666,bot,1659167666


In [9]:
# Attach each user's label to their tweets (left join on 'ID'), then discard
# users that have no tweets.
df = (
    dataset[['ID', 'tweet']]
    .merge(labels_df[['ID', 'label']], on='ID', how='left')
    .dropna(axis=0, subset=['tweet'])
)
df

Unnamed: 0,ID,tweet,label
0,17461978,[RT @CarnivalCruise: 🎉 Are you ready to see wh...,human
2,17685258,[RT @realDonaldTrump: THANK YOU #RNC2020! http...,human
3,15750898,[A family fears they may have been cheated out...,human
4,1659167666,[RT @VonteThePlug: Yeah but he ain’t got one h...,bot
5,34743251,"[RT @elonmusk: Aloha, welcome back from space ...",human
...,...,...,...
95,843514885644271616,"[@doriandetails https://t.co/iiW9NlqTmZ\n, @us...",bot
96,24578794,[The Vichy Republicans were embarrassing last ...,human
97,39349894,"[Without any basis, the NYAG has pledged to ta...",human
98,48223726,[Take a moment to soak it all in with one of o...,human


In [29]:
# Fix the seed so the train/validation/test membership is reproducible across
# kernel restarts (train_test_split shuffles randomly when no seed is given).
SPLIT_SEED = 42

# 80% train; the remaining 20% is halved into validation and test.
# NOTE: the splits are snapshots of `df` at this point; columns added to `df`
# in later cells do not appear in `train`, `valid`, or `test`.
train, vt = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
valid, test = train_test_split(vt, test_size=0.5, random_state=SPLIT_SEED)

### Sample User

Look at the properties of a sample user.

The first example is a human, the second example is a bot.

In [11]:
# Summarise the first user (ID, tweet count, label) and show their first five tweets.
first_user = df.iloc[0]
print("User ", first_user['ID'], ", ", len(first_user['tweet']), " tweets, ", first_user['label'], "\n")
first_user['tweet'][0:5]

User  17461978 ,  200  tweets,  human 



['RT @CarnivalCruise: 🎉 Are you ready to see what our newest ship’s name will be? 🎉 Thanks to all our partners for helping us unbox the name.…\n',
 'Who has time for receipts? Not me. @epson receipt scanners make it easy. No mess = no stress! Check it out at https://t.co/ofqbTdz0Qk. https://t.co/BtYwuyz9N5\n',
 'Steady wants to encourage you to invest in your financial future. Connect your bank account to @TheSteadyApp and have access to benefits like income insights, online medical visits, and cash grants of up to $1,000! Get started today by visiting https://t.co/5w3AvWd8Q0. #Sponsored https://t.co/ZHBfP5xDMg\n',
 "Good one, @rishid. But let’s see if y'all can do better than that. Come on, everybody. Show me your best #HandShaq! #ad https://t.co/xCloeLRfuM https://t.co/urVSOfTmT2\n",
 '#lsunationalchamps\n']

In [12]:
# Summarise the fourth user (ID, tweet count, label) and show their first five tweets.
fourth_user = df.iloc[3]
print("User ", fourth_user['ID'], ", ", len(fourth_user['tweet']), " tweets, ", fourth_user['label'], "\n")
fourth_user['tweet'][0:5]

User  1659167666 ,  88  tweets,  bot 



['RT @VonteThePlug: Yeah but he ain’t got one happy song nigga always distraught 😂\n',
 'RT @VonteThePlug: I’m tryna tell y’all I’m really like that but don’t take my word take a listen 😚🔥 https://t.co/1McoyAVcqh\n',
 'RT @VonteThePlug: Available on all platforms go get it 👇🏾 it’s heat I swear 🔥🤧https://t.co/lRDn7UGwNI\n',
 'RT @VonteThePlug: Boy fuck you\n',
 'RT @VonteThePlug: Bodied Da Baby beat. Ight Twitter y’all know what to do 🔥 https://t.co/nLZAzQUziG\n']

## Data Pre-Processing

Wei and Nguyen used GloVe word embeddings. Special words like Hashtags (#), mentions (@), and shortened URLs (http://t.co) were mapped to specific tokens.

### Data cleaning

In [13]:
def clean_text(input):
    """Normalize raw tweet text into placeholder-token form.

    Steps, in order: lowercase; replace t.co links with ``<url>``, hashtags with
    ``<hashtag>`` and mentions with ``<user>``; isolate the retweet marker ``rt``;
    replace every character other than ASCII letters, digits, ``<`` and ``>`` with
    a space; join digit groups separated only by spaces (so "1,000" becomes one
    number); replace digit runs with ``<number>``; collapse repeated whitespace.

    Args:
        input: Raw text (str). A single leading or trailing space may remain in
            the result because only runs of two or more whitespace characters
            are collapsed.

    Returns:
        The cleaned, lowercase string.
    """
    text = input.lower()
    # Match both http and https short links; the dot is escaped so it is literal.
    text = re.sub(r'https?://t\.co/\S+', ' <url> ', text)
    text = re.sub(r'#\S+', ' <hashtag> ', text)
    text = re.sub(r'@\S+', ' <user> ', text)
    # \b keeps this from firing inside ordinary words: without it "heart is"
    # would be rewritten to "hea rt is".
    text = re.sub(r'\brt\s+', ' rt ', text)
    text = re.sub(r'[^a-zA-Z0-9<>]', ' ', text)
    # Punctuation inside numbers is now a space ("1,000" -> "1 000"); rejoin the digits.
    text = re.sub(r'(?<=\d) +(?=\d)', '', text)
    text = re.sub(r'\d+', ' <number> ', text)
    return re.sub(r'\s\s+', ' ', text)

In [14]:
# Join each user's tweets into one string (tweets delimited by ' <SEP> '),
# then normalize that string with clean_text.
df['tweet_comb'] = df['tweet'].apply(' <SEP> '.join)
df['tweet_clean'] = df['tweet_comb'].apply(clean_text)

In [15]:
# Inspect the cleaned text of the first user.
df['tweet_clean'].iloc[0]

' rt <user> are you ready to see what our newest ship s name will be thanks to all our partners for helping us unbox the name <sep> who has time for receipts not me <user> receipt scanners make it easy no mess no stress check it out at <url> <url> <sep> steady wants to encourage you to invest in your financial future connect your bank account to <user> and have access to benefits like income insights online medical visits and cash grants of up to <number> get started today by visiting <url> <hashtag> <url> <sep> good one <user> but let s see if y all can do better than that come on everybody show me your best <hashtag> <hashtag> <url> <url> <sep> <hashtag> <sep> i stand with the student athletes <hashtag> <sep> wish me luck america i m back for <hashtag> it all starts tonight at <number> p et on <user> and catch me taking the ultimate plunge on shaqattack tomorrow at <number> p et <url> <sep> i am joining tennis champion <user> on <user> <hashtag> join us for an all star episode today 

### Tokenize Text and Convert to GloVe Embeddings

In [16]:
tokenizer = get_tokenizer('basic_english')
# Use a dedicated name for this scratch value: `test` is the held-out DataFrame
# produced by the train/validation/test split and must not be overwritten here.
sample_tokens = tokenizer(df.iloc[0]['tweet_clean'])
sample_tokens[0:5]

['rt', '<user>', 'are', 'you', 'ready']

In [17]:
# Tokens passed as `specials` when the vocabulary is built (placed first, so
# '<unk>' lines up with the default index 0 set on the vocabulary).
special_tokens = ['<unk>', '<sep>', '<user>', '<hashtag>', '<url>', '<number>', '<repeat>', 'rt']

In [18]:
def load_glove_vec(File):
    """Build a word -> position lookup from a GloVe text file.

    Only the first whitespace-delimited field of each line (the word) is read;
    the vector values are ignored, so the returned mapping holds positions,
    not embeddings.

    Args:
        File: Path to a GloVe ``.txt`` file with one "word v1 v2 ..." entry per line.

    Returns:
        dict mapping each word to the zero-based position of its entry among the
        non-blank lines of the file. If a word occurs more than once, the last
        position wins.
    """
    print("Loading Glove Model")
    glove = {}
    # Be explicit about UTF-8 so decoding does not depend on the platform default.
    with open(File, 'r', encoding='utf-8') as f:
        row = 0
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: nothing to index (and fields[0] would raise IndexError).
                continue
            glove[fields[0]] = row
            row += 1
    print(f"{len(glove)} words loaded!")
    return glove

In [19]:
# The gsutil download cell copies the GloVe file into the working directory, so
# load it from there (same './' convention as the dataset and label files)
# instead of relying on the Colab-only absolute path '/content/...'.
glove = load_glove_vec("./glove.twitter.27B.25d.txt")

Loading Glove Model
1193514 words loaded!


In [20]:
# type(glove_vec)

In [21]:
# Peek at the first 11 (word, index) entries, then report the mapping size.
for i, r in enumerate(glove.items()):
    print(r)
    if i == 10:
        break

print(len(glove))

('<user>', 0)
('.', 1)
(':', 2)
('rt', 3)
(',', 4)
('<repeat>', 5)
('<hashtag>', 6)
('<number>', 7)
('<url>', 8)
('!', 9)
('i', 10)
1193514


In [22]:
# Build the torchtext vocabulary with the special tokens placed first.
# NOTE(review): torchtext's vocab() factory interprets the mapping's values as
# token frequencies (entries below min_freq are dropped) and may remove the
# specials from the dict it is given. Here the values are row positions, not
# counts - confirm this is the intended use.
glove_vocab = vocab(glove, specials=special_tokens, special_first=True)
# Tokens missing from the vocabulary resolve to index 0.
glove_vocab.set_default_index(0)

Make sure the tokenizer and embeddings are working properly

In [23]:
# Round-trip check: tokenize the first user's cleaned text and look up the vocabulary ids.
test_tok = tokenizer(df['tweet_clean'].iloc[0])
test_vec = glove_vocab(test_tok)
# Show the first 50 tokens and their ids.
print(test_tok[0:50])
print(test_vec[0:50])

['rt', '<user>', 'are', 'you', 'ready', 'to', 'see', 'what', 'our', 'newest', 'ship', 's', 'name', 'will', 'be', 'thanks', 'to', 'all', 'our', 'partners', 'for', 'helping', 'us', 'unbox', 'the', 'name', '<sep>', 'who', 'has', 'time', 'for', 'receipts', 'not', 'me', '<user>', 'receipt', 'scanners', 'make', 'it', 'easy', 'no', 'mess', 'no', 'stress', 'check', 'it', 'out', 'at', '<url>', '<url>']
[7, 2, 72, 17, 581, 18, 165, 88, 279, 10770, 4414, 139, 592, 130, 58, 271, 18, 77, 279, 14157, 39, 4818, 293, 282559, 15, 592, 1, 129, 217, 137, 39, 55282, 80, 23, 2, 45599, 162622, 185, 35, 1229, 32, 2737, 32, 2287, 527, 35, 101, 68, 4, 4]


Create the pipeline for the data

In [42]:
def text_pipeline(x):
    """Tokenize a cleaned string and map every token to its vocabulary index."""
    return glove_vocab(tokenizer(x))


def label_pipeline(x):
    """Encode a label: 'bot' -> 1, any other value -> 0 (Human = 0, Bot = 1)."""
    if x == 'bot':
        return 1
    return 0

### Load Data in as Torch Tensors

In [25]:
# NOTE(review): presumably 280 characters per tweet x up to 200 tweets per user - confirm.
# MAX_SEQ_LEN is not referenced anywhere else in the visible notebook.
MAX_SEQ_LEN = 280 * 200
# Number of users per batch handed to the DataLoaders.
BATCH_SIZE = 8

In [26]:
# train.to_csv('/content/results' + '/train.csv', index=False)
# valid.to_csv('/content/results' + '/valid.csv', index=False)
# test.to_csv('/content/results' + '/test.csv', index=False)

In [37]:
class TweetDataset(Dataset):
    """Map-style dataset yielding ``(cleaned_text, label)`` pairs from a DataFrame.

    The frame must provide the columns 'tweet_clean' and 'label'. Items are
    addressed by position (``iloc``), so the frame's index labels are irrelevant.
    """

    def __init__(self, df):
        # Keep the two columns as pandas Series; no copy or conversion is made.
        self.texts, self.labels = df['tweet_clean'], df['label']

    def __len__(self):
        return self.texts.shape[0]

    def __getitem__(self, idx):
        return self.texts.iloc[idx], self.labels.iloc[idx]

In [38]:
# Build each dataset from the rows of `df` that belong to the split. The splits
# are taken before 'tweet_clean' is added to `df`, so on a fresh top-to-bottom
# run the split frames do not carry that column; selecting by index picks the
# cleaned text up from `df` (and is a no-op if the split already has it).
train_ds = TweetDataset(df.loc[train.index])
valid_ds = TweetDataset(df.loc[valid.index])
test_ds = TweetDataset(df.loc[test.index])

In [47]:
def collate_batch(batch):
    """Collate ``(text, label)`` samples into flat tensors on ``device``.

    Returns a tuple ``(labels, tokens, offsets)``:
      * labels  - int64 tensor with one encoded label per sample,
      * tokens  - int64 tensor holding every sample's token ids concatenated end to end,
      * offsets - start position of each sample inside ``tokens``.
    """
    encoded_labels = [label_pipeline(lbl) for _, lbl in batch]
    token_tensors = [
        torch.tensor(text_pipeline(txt), dtype=torch.int64) for txt, _ in batch
    ]

    # Start positions are the running total of the preceding samples' lengths,
    # so the first sample starts at 0 and the last length is not needed.
    preceding_lengths = [0] + [tokens.size(0) for tokens in token_tensors]
    starts = torch.tensor(preceding_lengths[:-1]).cumsum(dim=0)

    labels = torch.tensor(encoded_labels, dtype=torch.int64)
    flat_tokens = torch.cat(token_tensors)
    return labels.to(device), flat_tokens.to(device), starts.to(device)

In [48]:
# Reshuffle the training samples every epoch so batches differ between epochs;
# the validation and test loaders keep a fixed order.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_loader = DataLoader(valid_ds, batch_size=BATCH_SIZE, collate_fn=collate_batch)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, collate_fn=collate_batch)

In [49]:
# Sanity-check the loader: print the concatenated token ids and the labels of
# the first batches (the offsets element is ignored here).
for (i, (labels, texts, _)) in enumerate(train_loader):
    print("Vec: ", texts, " Label: ", labels)
    # Stop once the batch with i == 11 has been printed.
    if i > 10:
      break

Vec:  tensor([  197,    12,   184,  ...,    12,    74, 71602])  Label:  tensor([0, 0, 0, 0, 1, 0, 1, 0])
Vec:  tensor([   7,    2,   74,  ...,    2, 2258,  295])  Label:  tensor([0, 1, 0, 1, 1, 0, 0, 0])
Vec:  tensor([ 57, 567,  18,  ..., 599,   3,   4])  Label:  tensor([0, 0, 1, 0, 0, 0, 0, 0])
Vec:  tensor([     4,      1,      4,  ...,   7510,  95880, 375545])  Label:  tensor([0, 0, 0, 1, 0, 0, 0, 0])
Vec:  tensor([    7,     2, 16781,  ...,    72,    57,   210])  Label:  tensor([0, 1, 1, 0, 1, 0, 0, 0])
Vec:  tensor([   7,    2,   57,  ...,   13, 1205, 1052])  Label:  tensor([0, 0, 0, 0, 0, 0, 0, 0])
Vec:  tensor([   7,    2, 5129,  ...,  231,    4,    4])  Label:  tensor([0, 0, 1, 1, 0, 0, 0, 0])
Vec:  tensor([   2, 1761,   41,  ..., 1817,  556,  271])  Label:  tensor([0, 0, 0, 0, 0, 0, 0, 0])
Vec:  tensor([  928,    48,    13,  ...,  1592, 51494,     4])  Label:  tensor([0, 0, 1, 0, 0, 0, 0, 0])
Vec:  tensor([  2,   2,  17,  ...,  45, 123,  35])  Label:  tensor([1, 0, 0, 0, 1, 1,

## Build Model

In [50]:
# Number of rows in the embedding table (one per vocabulary entry).
VOCAB_SIZE = len(glove_vocab)
# Width of each token embedding; matches the 25-d GloVe file downloaded above.
EMBEDDING_DIM = 25
# LSTM hidden size.
HIDDEN_DIM = 128
# NOTE(review): OUTPUT_DIM is not referenced anywhere else in the visible notebook.
OUTPUT_DIM = 100

In [111]:
class BiLSTM(nn.Module):
    """Stacked bidirectional LSTM classifier over token-id sequences.

    Pipeline: embedding -> multi-layer BiLSTM -> dropout -> linear head ->
    log-softmax. The output is one row of class log-probabilities per sequence,
    which is the input format ``nn.NLLLoss`` expects.

    The embedding table is randomly initialised; no pretrained vectors are
    loaded by this class.

    Args:
        vocab_size: Rows in the embedding table. Defaults to ``VOCAB_SIZE``.
        embedding_dim: Embedding width. Defaults to ``EMBEDDING_DIM``.
        hidden_dim: LSTM hidden size per direction. Defaults to ``HIDDEN_DIM``.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output classes (2: human, bot).
        dropout: Dropout probability between LSTM layers and before the head.
        batch_size: Default batch size used by ``init_hidden``. Defaults to ``BATCH_SIZE``.
    """

    def __init__(self, vocab_size=None, embedding_dim=None, hidden_dim=None,
                 num_layers=3, num_classes=2, dropout=0.25, batch_size=None):
        super(BiLSTM, self).__init__()

        # Fall back to the notebook-level constants only when no value is given,
        # so `BiLSTM()` keeps working and explicit sizes need no globals.
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.embedding_dim = EMBEDDING_DIM if embedding_dim is None else embedding_dim
        self.hidden_dim = HIDDEN_DIM if hidden_dim is None else hidden_dim
        self.vocab_size = VOCAB_SIZE if vocab_size is None else vocab_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.sequence_len = 280

        # Layers
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim, num_layers=self.num_layers,
                            bidirectional=True,
                            # Inter-layer dropout is meaningless (and warns) with one layer.
                            dropout=dropout if self.num_layers > 1 else 0.0,
                            batch_first=True)
        self.dropout = nn.Dropout(dropout)
        # Forward and backward final states are concatenated -> 2 * hidden_dim features.
        self.fc = nn.Linear(2 * self.hidden_dim, self.num_classes)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def init_hidden(self, batch_size=None):
        """Return zero ``(h_0, c_0)`` states shaped for the bidirectional LSTM.

        Each tensor has shape ``(2 * num_layers, batch_size, hidden_dim)`` and is
        created on the same device and dtype as the model's parameters.
        """
        n = self.batch_size if batch_size is None else batch_size
        shape = (2 * self.num_layers, n, self.hidden_dim)
        ref = self.embedding.weight
        return (torch.zeros(shape, device=ref.device, dtype=ref.dtype),
                torch.zeros(shape, device=ref.device, dtype=ref.dtype))

    def forward(self, x, offsets=None):
        """Classify token-id sequences.

        Args:
            x: Either a ``(batch, seq_len)`` tensor of token ids, or a 1-D tensor.
                A 1-D tensor without ``offsets`` is treated as a single sequence.
            offsets: Optional 1-D tensor of start positions. When given with a
                1-D ``x``, ``x`` is split at these positions into one sequence
                per sample; the sequences are padded with index 0 and packed so
                the padding does not influence the result.

        Returns:
            ``(batch, num_classes)`` tensor of log-probabilities.
        """
        lengths = None
        if x.dim() == 1:
            if offsets is None:
                x = x.unsqueeze(0)
            else:
                bounds = offsets.tolist() + [x.size(0)]
                pieces = [x[start:end] for start, end in zip(bounds[:-1], bounds[1:])]
                # Packing requires every length >= 1; an empty sample is read as
                # a single padding token.
                lengths = torch.tensor([max(piece.size(0), 1) for piece in pieces])
                x = nn.utils.rnn.pad_sequence(pieces, batch_first=True)

        embed = self.embedding(x)
        if lengths is not None:
            embed = nn.utils.rnn.pack_padded_sequence(
                embed, lengths, batch_first=True, enforce_sorted=False)

        _, (h_n, c_n) = self.lstm(embed)
        # Keep the last states for inspection, detached so the autograd graph of
        # the previous batch is not kept alive between steps.
        self.hidden = (h_n.detach(), c_n.detach())

        # h_n is ordered (layer, direction): the last two entries are the top
        # layer's forward and backward final states.
        features = torch.cat((h_n[-2], h_n[-1]), dim=1)
        logits = self.fc(self.dropout(features))
        return self.log_softmax(logits)

## Train Model

### Configure Checkpoints

### Train Method

In [81]:
import time
from tqdm import tqdm

In [112]:
def train(dataloader, model, optimizer, criterion, num_epochs, log_interval=500):
    """Run the optimisation loop and report running / per-epoch accuracy.

    Args:
        dataloader: Iterable yielding ``(label, text, offsets)`` batches.
        model: Module called as ``model(text)``; its output is passed to
            ``criterion`` and ``argmax(1)`` of it is taken as the prediction.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(prediction, label)``.
        num_epochs: Number of passes over ``dataloader``.
        log_interval: Print the running accuracy every this many batches.

    Returns:
        None. Progress is printed.
    """
    model.train()

    for epoch in tqdm(range(num_epochs)):
        epoch_start_time = time.time()
        # Accuracy counters are per epoch.
        total_acc, total_count = 0, 0

        for idx, (label, text, _) in enumerate(dataloader):
            # NOTE(review): the third batch element (per-sample offsets) is not
            # passed to the model; confirm the model can recover one prediction
            # per label from `text` alone.
            # reshape(-1) keeps the batch dimension even for a single-sample
            # batch, which squeeze() would collapse to a 0-d tensor.
            target = label.reshape(-1)

            optimizer.zero_grad()
            predicted_label = model(text)
            loss = criterion(predicted_label, target)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            optimizer.step()

            total_acc += (predicted_label.argmax(1) == target).sum().item()
            total_count += target.size(0)
            if idx % log_interval == 0 and idx > 0:
                print('| epoch {:3d} | {:5d}/{:5d} batches '
                      '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))

        # With fewer batches than log_interval nothing is printed above, so
        # always summarise the epoch (skipped for an empty dataloader).
        if total_count:
            print('| end of epoch {:3d} | time {:5.2f}s '
                  '| accuracy {:8.3f}'.format(epoch, time.time() - epoch_start_time,
                                              total_acc / total_count))

In [None]:
model = BiLSTM().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# NLLLoss expects log-probabilities (log-softmax output) of shape (batch, classes)
# as its input, not plain softmax probabilities.
criterion = torch.nn.NLLLoss()

# NOTE(review): `train` here is the training function; the same name is bound to the
# training DataFrame by the split cell earlier in the notebook.
train(dataloader=train_loader, model=model, optimizer=optimizer, criterion=criterion, num_epochs=10)

  0%|          | 0/10 [00:00<?, ?it/s]

### Plot

In [None]:
# train_loss_list, valid_loss_list, global_steps_list = load_metrics('/content/checkpoints' + '/metrics.pt')
# plt.plot(global_steps_list, train_loss_list, label='Train')
# plt.plot(global_steps_list, valid_loss_list, label='Valid')
# plt.xlabel('Global Steps')
# plt.ylabel('Loss')
# plt.legend()
# plt.show() 