In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset CSV
# and the model directory used by later cells are read from under this path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install transformers

In [5]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW

In [None]:
# Load the data
DATA_PATH = "/content/drive/MyDrive/Real Time ML Final Project/data/completeTWEETdataset.csv"
MAX_LENGTH = 256  # every tweet is padded / truncated to this many tokens

df = pd.read_csv(DATA_PATH)

# Convert text labels to numerical values: 1 for 'bot', 0 for anything else
df['label'] = (df['account.type'] == 'bot').astype(int)

# Tokenize the text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# truncation is requested explicitly instead of relying on the tokenizer's
# legacy fallback, so every row has exactly MAX_LENGTH ids and the batch can
# be stacked into a single tensor.
encoded_data = tokenizer.batch_encode_plus(
    df['text'].tolist(),  # plain list of strings rather than a NumPy object array
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=MAX_LENGTH,
    return_tensors='pt'
)

input_ids = encoded_data['input_ids']
attention_masks = encoded_data['attention_mask']
labels = torch.tensor(df['label'].to_numpy())

In [7]:
# Split the data into training and validation sets (80% / 20%)
SPLIT_SEED = 42  # fixed seed so the partition is identical on every run

dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size  # remainder, so the two sizes always sum to len(dataset)

# A dedicated, seeded generator makes the split reproducible without touching
# the global torch RNG state.
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [None]:
# Build the batch iterators for both splits.
batch_size = 32

# Training batches are drawn in random order (shuffle=True makes the loader
# use a RandomSampler over the dataset).
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation batches are read in their stored order (shuffle=False makes the
# loader use a SequentialSampler).
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Load the pre-trained BERT model with a 2-class classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=2,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

# Set the optimizer and the learning rate
LEARNING_RATE = 2e-5
ADAM_EPSILON = 1e-8

# Use PyTorch's AdamW: the transformers implementation is deprecated in favour
# of it. weight_decay=0.0 keeps the regularisation the previous optimizer
# applied by default (torch's own default would be 0.01).
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=LEARNING_RATE,
                              eps=ADAM_EPSILON,
                              weight_decay=0.0)

In [None]:
import matplotlib.pyplot as plt

# Fine-tune the model
epochs = 2

# Run every batch on whichever device the model's parameters already live on,
# so the loop works unchanged whether or not the model was moved to a GPU.
device = next(model.parameters()).device

# Per-epoch history, plotted after training
train_loss_values = []
val_loss_values = []
train_acc_values = []
val_acc_values = []

for epoch in range(epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))

    # ---- Training pass ----
    model.train()
    total_loss = 0.0
    total_correct = 0

    for step, batch in enumerate(train_dataloader):
        # Each batch is (input_ids, attention_mask, labels), as packed into the TensorDataset
        batch_input_ids, batch_input_mask, batch_labels = (t.to(device) for t in batch)

        model.zero_grad()
        outputs = model(batch_input_ids,
                        token_type_ids=None,
                        attention_mask=batch_input_mask,
                        labels=batch_labels)
        # With labels supplied, the model returns the loss first and the logits second
        loss = outputs[0]
        logits = outputs[1]
        total_loss += loss.item()

        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Count hits with torch ops on detached logits instead of routing
        # tensors through NumPy, and do NOT rebind the notebook-level `labels`
        # tensor: the split cell builds the TensorDataset from it.
        batch_preds = logits.detach().argmax(dim=1)
        total_correct += (batch_preds == batch_labels).sum().item()

        if step % 50 == 0:
            print('Batch {:>5,}  of  {:>5,}. Loss: {:>0.5f}'.format(step, len(train_dataloader), loss.item()))

    # Loss is averaged over batches, accuracy over individual samples
    train_loss = total_loss / len(train_dataloader)
    train_acc = total_correct / len(train_dataset)
    train_loss_values.append(train_loss)
    train_acc_values.append(train_acc)

    print("Train loss: {0:.4f}".format(train_loss))
    print("Train accuracy: {0:.4f}".format(train_acc))

    # ---- Validation pass ----
    model.eval()
    total_eval_loss = 0.0
    total_eval_correct = 0

    for batch in validation_dataloader:
        batch_input_ids, batch_input_mask, batch_labels = (t.to(device) for t in batch)

        # No gradients are needed for evaluation
        with torch.no_grad():
            outputs = model(batch_input_ids,
                            token_type_ids=None,
                            attention_mask=batch_input_mask,
                            labels=batch_labels)

        total_eval_loss += outputs[0].item()
        batch_preds = outputs[1].argmax(dim=1)
        total_eval_correct += (batch_preds == batch_labels).sum().item()

    val_loss = total_eval_loss / len(validation_dataloader)
    val_acc = total_eval_correct / len(val_dataset)
    val_loss_values.append(val_loss)
    val_acc_values.append(val_acc)

    print("Val loss: {0:.4f}".format(val_loss))
    print("Val accuracy: {0:.4f}".format(val_acc))

# NOTE(review): the inference cell below loads a checkpoint from Drive, but the
# fine-tuned weights are never written out here — confirm where the model is
# saved (e.g. model.save_pretrained(...)).

# Plot the train and validation loss and accuracy, one point per epoch
epoch_axis = range(1, len(train_loss_values) + 1)

plt.plot(epoch_axis, train_loss_values, label='Train Loss')
plt.plot(epoch_axis, val_loss_values, label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss per epoch')
plt.legend()
plt.show()

plt.plot(epoch_axis, train_acc_values, label='Train Acc')
plt.plot(epoch_axis, val_acc_values, label='Val Acc')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Accuracy per epoch')
plt.legend()
plt.show()



## Inference

In [8]:
from transformers import BertTokenizer, BertForSequenceClassification

# Tokenizer: the stock uncased BERT vocabulary.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Classifier: weights are read from the project folder on the mounted Drive.
# NOTE(review): no cell visible in this notebook writes a checkpoint to this
# directory — confirm it was populated (config + weights) beforehand.
model_dir = "/content/drive/MyDrive/Real Time ML Final Project"
model = BertForSequenceClassification.from_pretrained(model_dir)

In [9]:
# Define the input text
text = "just aother terrible day at school, excited to get over this"

# Tokenize the input text; cap the length at the same 256 tokens used when the
# training data was encoded so over-long inputs cannot overflow the model.
tokens = tokenizer.encode_plus(text,
                               add_special_tokens=True,
                               truncation=True,
                               max_length=256,
                               return_tensors='pt')

# Make a prediction on the input text.
# eval() switches off dropout and no_grad() skips building the autograd graph,
# neither of which is wanted for a pure forward pass.
model.eval()
with torch.no_grad():
    outputs = model(tokens['input_ids'], attention_mask=tokens['attention_mask'])

# Without labels the first output holds the logits; softmax over the class axis
probs = outputs[0].softmax(1)
predicted_class = probs.argmax(1)

# Print the predicted class (training encoded 'bot' as 1 and everything else as 0)
print(predicted_class.item())

0
