# BERT Encoder Classifier

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

# Load data.
# `df` must already exist as a DataFrame with a text column 'tweet' and a
# label column 'class', for example:
# df = pd.read_csv('your_data.csv')
# df = df[['tweet', 'class']]

# Cast the labels to integers so they can be used as class indices.
df['class'] = df['class'].astype(int)

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['tweet'], df['class'], test_size=0.2, random_state=42
)

# Load the pre-trained BERT tokenizer and a 2-class classification head.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize the texts; each split is padded to its own longest sequence.
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)

# Build the tensor datasets.
# The label Series keep the shuffled index produced by train_test_split, so
# they are converted positionally with .to_numpy() instead of being handed to
# torch.tensor() directly (which would index the Series like a sequence).
# dtype=torch.long is what the classification loss expects for class indices.
train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels.to_numpy(), dtype=torch.long),
)
val_dataset = TensorDataset(
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(val_labels.to_numpy(), dtype=torch.long),
)

# Create data loaders; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Fine-tune on the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Use PyTorch's AdamW: the transformers implementation is deprecated in favour
# of it. eps and weight_decay are set explicitly to mirror what the deprecated
# class used by default -- NOTE(review): confirm against the installed version.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Training loop: one optimisation pass over train_loader per epoch, followed
# by a gradient-free evaluation pass over val_loader.
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
        optimizer.zero_grad()
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    val_loss = 0
    val_accuracy = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
            # labels must be supplied here too: without them outputs.loss is
            # None and the .item() call below would raise AttributeError.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            val_loss += outputs.loss.item()
            preds = torch.argmax(logits, dim=1)
            val_accuracy += torch.sum(preds == labels).item()

    # Loss is averaged per batch, accuracy per example.
    val_loss /= len(val_loader)
    val_accuracy /= len(val_dataset)
    print(f'Epoch {epoch + 1}/{num_epochs}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

# Evaluation
# Run the fine-tuned model on held-out or new texts. The texts go through the
# same tokenizer call as the training and validation data, and the predicted
# class index (0 or 1) is collected for every text.

# Placeholder inputs: replace with the real test texts and their 0/1 labels.
test_texts = ["Your test texts here"]
test_labels = [1]

test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)
test_dataset = TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

model.eval()
predictions = []
with torch.no_grad():
    for test_batch in test_loader:
        # Each batch holds exactly two tensors: token ids and attention mask.
        batch_ids, batch_mask = (tensor.to(device) for tensor in test_batch)
        batch_logits = model(batch_ids, attention_mask=batch_mask).logits
        predictions += batch_logits.argmax(dim=1).tolist()

# The predictions are class indices. If training used labels 0 and 1 directly,
# no conversion is needed; otherwise map the indices back to the original
# label names, e.g.:
# original_labels = ['label_0', 'label_1']
# mapped_predictions = [original_labels[pred] for pred in predictions]

print("Predictions:", predictions)



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader


# Lazily created tokenizer shared by every tokenize_text() call.
_BERT_TOKENIZER = None


def _get_bert_tokenizer():
    """Return the shared 'bert-base-uncased' tokenizer, loading it on first use."""
    global _BERT_TOKENIZER
    if _BERT_TOKENIZER is None:
        _BERT_TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')
    return _BERT_TOKENIZER


# Function to tokenize the input text using BERT tokenizer
def tokenize_text(text):
    """Encode ``text`` with the BERT tokenizer, including special tokens.

    The tokenizer is loaded once and reused; loading it on every call is very
    slow when this function is applied to each row of a DataFrame.

    Args:
        text: The string to encode.

    Returns:
        Whatever ``tokenizer.encode`` returns for ``text`` with
        ``add_special_tokens=True``.
    """
    return _get_bert_tokenizer().encode(text, add_special_tokens=True)


# Function to train the model
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` and return it together with the per-epoch mean loss.

    Batches may be ``(inputs, labels)`` pairs or
    ``(input_ids, attention_mask, labels)`` triples. For triples the mask is
    forwarded to the model as the ``attention_mask`` keyword argument.

    Args:
        model: Callable module. It may return raw logits or an object exposing
            a ``logits`` attribute (as Hugging Face classification heads do).
        train_loader: Iterable of batches in one of the layouts above.
        criterion: Loss callable invoked as ``criterion(logits, labels)``.
        optimizer: Optimizer updating the model's parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        A ``(model, train_losses)`` tuple; ``train_losses`` holds one
        sample-weighted mean loss per epoch.
    """
    train_losses = []
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        samples_seen = 0
        for batch in train_loader:
            if len(batch) == 3:
                inputs, attention_mask, labels = batch
                forward_kwargs = {"attention_mask": attention_mask}
            else:
                inputs, labels = batch
                forward_kwargs = {}

            optimizer.zero_grad()
            outputs = model(inputs, **forward_kwargs)
            # Structured model outputs carry the logits as an attribute;
            # plain tensors are already the logits.
            logits = getattr(outputs, "logits", outputs)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            batch_size = inputs.size(0)
            running_loss += loss.item() * batch_size
            samples_seen += batch_size

        # Average over the samples actually processed so the value stays
        # correct when the loader drops or truncates batches.
        epoch_loss = running_loss / samples_seen if samples_seen else float("nan")
        train_losses.append(epoch_loss)
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}")
    return model, train_losses


# Sample usage
if __name__ == "__main__":
    # Load the data; the CSV is read for its 'tweet' and 'class' columns.
    df = pd.read_csv('cleaned_data_nosw.csv')

    # Drop rows without tweet text and renumber the remaining rows.
    df.dropna(subset=['tweet'], inplace=True)
    df.reset_index(drop=True, inplace=True)

    # AdamW is deliberately not imported from transformers here: this cell
    # does not use it, and that class is deprecated in favour of
    # torch.optim.AdamW.
    from transformers import BertTokenizer, BertForSequenceClassification

    # Load the pre-trained tokenizer once and reuse it for every row below.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Token ids per tweet (special tokens included). The shared tokenizer is
    # used directly so it is not reloaded for each row.
    df['tokens'] = df['tweet'].apply(
        lambda text: tokenizer.encode(text, add_special_tokens=True)
    )

    # Split data into train and validation sets
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

    # Hyperparameters. Only max_len and num_classes are consumed below; the
    # remaining values are not passed to the pre-trained model in this cell.
    max_len = 100  # Maximum sequence length
    d_model = 768  # BERT embedding dimension
    h = 4  # Number of attention heads
    d_ff = 2048  # Hidden layer size in feedforward network
    num_layers = 3  # Number of encoder layers
    num_classes = 2  # Number of output classes

    # Load pre-trained model (weights) with a num_classes-way classifier head.
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=num_classes,
        output_attentions=False,
        output_hidden_states=False,
    )

    # Rebinds tokenize_text to a version that truncates/pads each text to
    # max_len and returns PyTorch tensors.
    def tokenize_text(text):
        return tokenizer.encode_plus(
            text,
            max_length=max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

    class TextDataset(Dataset):
        """Dataset yielding (input_ids, attention_mask, label) per tweet."""

        def __init__(self, df):
            # Tokenize everything up front so __getitem__ is a cheap lookup.
            self.data = df['tweet'].apply(tokenize_text)
            self.labels = df['class']

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            encoded = self.data.iloc[idx]
            # squeeze(0) removes only the leading batch axis added by
            # return_tensors='pt'; a bare squeeze() could drop other axes.
            input_ids = encoded['input_ids'].squeeze(0)
            attention_mask = encoded['attention_mask'].squeeze(0)
            # CrossEntropyLoss needs integer class indices of dtype long.
            label = torch.tensor(int(self.labels.iloc[idx]), dtype=torch.long)
            return input_ids, attention_mask, label

    # Build the training dataset/loader, loss and optimizer.
    train_dataset = TextDataset(train_df)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=2e-5)  # Recommended learning rate for fine-tuning BERT

    # Train the model
    trained_model, train_losses = train_model(model, train_loader, criterion, optimizer)

    # Plot training loss
    plt.plot(train_losses)
    plt.xlabel('Epochs')
    plt.ylabel('Training Loss')
    plt.title('Training Loss')
    plt.show()
