<a href="https://colab.research.google.com/github/kgreed4/no_hate_transformer/blob/kgreed/encoder_official.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import copy
import functools

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

# Define the Transformer components 

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch-first input.

    The rest of this file treats axis 0 as the batch axis and axis 1 as the
    sequence axis, so the table is stored as ``(1, max_len, d_model)`` and is
    sliced along the sequence axis.  (Slicing by ``x.size(0)`` would index the
    table by batch position, giving every token of a sample the same offset.)

    Args:
        max_len: Largest sequence length the table supports.
        d_model: Size of the last (feature) axis of the input.
    """

    def __init__(self, max_len, d_model):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float()
            * (-torch.log(torch.tensor(10000.0)) / d_model)
        )
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine
        # columns, so trim the angle table to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading singleton axis lets the table broadcast over the batch.
        # NOTE: the buffer shape is now (1, max_len, d_model), so checkpoints
        # saved with the previous (max_len, 1, d_model) buffer will not load.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encoding for its first ``x.size(1)`` positions.

        Args:
            x: Tensor indexed as ``(batch, seq_len, d_model)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_len`` given at
                construction time.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len]


class FeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Map ``x`` (last axis ``d_model``) through the network."""
        hidden = self.linear1(x).relu()
        return self.linear2(self.dropout(hidden))


class MultiHeadedAttention(nn.Module):
    """Multi-head scaled dot-product attention over batch-first inputs.

    Args:
        d_model: Feature size of the inputs and the output.
        h: Number of attention heads; must divide ``d_model`` evenly.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, h, dropout=0.1):
        super().__init__()
        assert d_model % h == 0, "d_model must be divisible by h"
        self.d_k = d_model // h
        self.h = h
        # Three input projections (query, key, value) plus the output projection.
        self.linears = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(4)])
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value``.

        Args:
            query, key, value: Tensors indexed as ``(batch, seq, d_model)``.
            mask: Optional tensor where ``0`` marks positions to ignore.
                A 2-D ``(batch, key_len)`` padding mask is expanded to
                ``(batch, 1, 1, key_len)``; any other mask gets a head axis
                inserted at position 1 (e.g. ``(batch, 1, key_len)`` or
                ``(batch, query_len, key_len)``), as before.

        Returns:
            Tensor of shape ``(batch, query_len, d_model)``.
        """
        if mask is not None:
            if mask.dim() == 2:
                # A plain padding mask needs both a head axis and a query
                # axis; a single unsqueeze would line the batch axis up with
                # the head axis of the scores and mis-broadcast.
                mask = mask[:, None, None, :]
            else:
                mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = [
            proj(t).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for proj, t in zip(self.linears, (query, key, value))
        ]

        # 2) Apply attention on all the projected vectors in batch.
        x, _ = self.attention(query, key, value, mask=mask, dropout=self.dropout)

        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)

    def attention(self, query, key, value, mask=None, dropout=None):
        """Compute scaled dot-product attention.

        Args:
            query, key, value: Per-head tensors whose last axis is ``d_k``.
            mask: Optional tensor broadcastable to the score tensor; entries
                equal to ``0`` are filled with ``-1e9`` before the softmax.
            dropout: Optional module applied to the attention weights.

        Returns:
            Tuple ``(weighted values, attention weights)``.
        """
        # A Python float scale avoids building a new tensor on every call.
        scores = torch.matmul(query, key.transpose(-2, -1)) / (self.d_k ** 0.5)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        p_attn = torch.nn.functional.softmax(scores, dim=-1)
        if dropout is not None:
            p_attn = dropout(p_attn)
        return torch.matmul(p_attn, value), p_attn


class AddNorm(nn.Module):
    """Residual wrapper: layer-normalise, run a sublayer, apply dropout, add.

    Args:
        size: Feature size passed to ``nn.LayerNorm``.
        dropout: Dropout probability applied to the sublayer output.
    """

    def __init__(self, size, dropout=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Return ``x + dropout(sublayer(norm(x)))``.

        Args:
            x: Input tensor whose last axis has ``size`` features.
            sublayer: Callable applied to the normalised input.
        """
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch


class EncoderLayer(nn.Module):
    """One encoder layer: self-attention then feed-forward, each inside AddNorm.

    Args:
        size: Feature size handed to both ``AddNorm`` wrappers.
        self_attn: Module called as ``self_attn(x, x, x, mask)``.
        feed_forward: Module called with a single tensor argument.
        dropout: Dropout probability for the ``AddNorm`` wrappers.
    """

    def __init__(self, size, self_attn, feed_forward, dropout=0.1):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.add_norm1 = AddNorm(size, dropout)
        self.add_norm2 = AddNorm(size, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run attention and feed-forward sublayers on ``x``.

        Args:
            x: Input tensor.
            mask: Optional mask forwarded unchanged to ``self_attn``.
        """

        def attend(normed):
            # Self-attention: the same tensor is query, key and value.
            return self.self_attn(normed, normed, normed, mask)

        attended = self.add_norm1(x, attend)
        return self.add_norm2(attended, self.feed_forward)


class TransformerEncoder(nn.Module):
    """Stack of ``num_layers`` independent copies of ``encoder_layer``.

    Args:
        encoder_layer: Prototype layer; called as ``layer(x, mask)``.
        num_layers: Number of layers in the stack.
    """

    def __init__(self, encoder_layer, num_layers):
        super().__init__()
        # Deep-copy the prototype for every slot.  Repeating the same
        # instance would make all "layers" share a single set of weights.
        self.layers = nn.ModuleList(
            [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
        )
        self.num_layers = num_layers

    def forward(self, x, mask=None):
        """Pass ``x`` through every layer in order, forwarding ``mask`` to each."""
        for layer in self.layers:
            x = layer(x, mask)
        return x


class TransformerClassifier(nn.Module):
    """Sequence classifier: pretrained BERT, positional encoding, encoder stack, linear head.

    Args:
        d_model: Feature size used by the positional encoding, encoder and
            head.  It is applied directly to BERT's output, so it has to
            match that output's feature size.
        h: Number of attention heads in each encoder layer.
        d_ff: Hidden size of each encoder layer's feed-forward network.
        num_layers: Number of encoder layers.
        num_classes: Number of output logits.
        max_len: Maximum sequence length for the positional encoding.
        dropout: Dropout probability for the encoder layer and the head.
    """

    def __init__(self, d_model, h, d_ff, num_layers, num_classes, max_len, dropout=0.1):
        super().__init__()
        self.embedding = BertModel.from_pretrained('bert-base-uncased')
        self.position_encoding = PositionalEncoding(max_len, d_model)

        attention = MultiHeadedAttention(d_model, h)
        feed_forward = FeedForward(d_model, d_ff)
        prototype_layer = EncoderLayer(d_model, attention, feed_forward, dropout)
        self.encoder = TransformerEncoder(prototype_layer, num_layers)

        self.fc = nn.Linear(d_model, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: Token ids passed straight to the BERT model.
            mask: Optional mask forwarded to the encoder stack only; it is
                not given to BERT and is not used in the pooling step.
        """
        # Element 0 of BERT's output is used as the per-token representation.
        token_states = self.embedding(x)[0]
        encoded = self.encoder(self.position_encoding(token_states), mask)
        pooled = encoded.mean(dim=1)  # Aggregate across sequence length
        return self.fc(self.dropout(pooled))


@functools.lru_cache(maxsize=1)
def _get_tokenizer():
    """Load the 'bert-base-uncased' tokenizer once and reuse it afterwards."""
    return BertTokenizer.from_pretrained('bert-base-uncased')


# Function to tokenize the input text using BERT tokenizer
def tokenize_text(text):
    """Encode ``text`` with the BERT tokenizer, including special tokens.

    The tokenizer is loaded on the first call and cached; this function is
    applied once per DataFrame row, so reloading it on every call would
    repeat the vocabulary load for every tweet.

    Args:
        text: String to encode.

    Returns:
        The value of ``tokenizer.encode(text, add_special_tokens=True)``.
    """
    return _get_tokenizer().encode(text, add_special_tokens=True)


# Function to train the model
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Args:
        model: Module called as ``model(inputs)``.
        train_loader: Iterable of ``(inputs, labels)`` batches that exposes a
            sized ``dataset`` attribute.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        num_epochs: Number of passes over the loader.

    Returns:
        Tuple ``(model, train_losses)`` where ``train_losses`` holds one
        sample-weighted average loss per epoch.
    """
    history = []
    for epoch in range(num_epochs):
        model.train()
        weighted_loss_sum = 0.0
        for batch_inputs, batch_labels in train_loader:
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_inputs), batch_labels)
            batch_loss.backward()
            optimizer.step()
            # Weight by batch size so a smaller final batch is averaged fairly.
            weighted_loss_sum += batch_loss.item() * batch_inputs.size(0)
        epoch_loss = weighted_loss_sum / len(train_loader.dataset)
        history.append(epoch_loss)
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}")
    return model, history


# Sample usage
if __name__ == "__main__":
    # Assuming you have a DataFrame df with columns ['tweet', 'class']

    # Define model hyperparameters.  They are declared before the dataset is
    # built because the dataset needs max_len to truncate sequences.
    max_len = 100  # Maximum sequence length
    d_model = 768  # BERT embedding dimension
    h = 4  # Number of attention heads
    d_ff = 2048  # Hidden layer size in feedforward network
    num_layers = 3  # Number of encoder layers
    # NOTE(review): CrossEntropyLoss needs every value in 'class' to lie in
    # [0, num_classes) -- confirm this against the CSV.
    num_classes = 2  # Number of output classes

    # Load the data
    df = pd.read_csv('cleaned_data_nosw.csv')

    # Preprocess data - drop NaN in tweet column
    df.dropna(subset=['tweet'], inplace=True)  # Drop rows where 'tweet' column has NaN values
    df.reset_index(drop=True, inplace=True)    # Reset the index of the DataFrame, and drop the old index

    # Tokenize the text data
    df['tokens'] = df['tweet'].apply(tokenize_text)

    # Split data into train and validation sets
    # (val_df is held out here but not used further in this script.)
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

    # Define dataset and dataloader
    class TextDataset(Dataset):
        """Token-id sequences and labels taken from a DataFrame.

        Args:
            df: DataFrame with a 'tokens' column (lists of token ids) and a
                'class' column (integer labels).
            max_len: Optional cap on sequence length applied in
                ``collate_fn``; ``None`` disables truncation.
        """

        def __init__(self, df, max_len=None):
            # Plain lists give positional indexing.  After train_test_split
            # the frame keeps its shuffled original index, so indexing the
            # Series with a DataLoader position would look up by label and
            # either return the wrong row or raise KeyError.
            self.data = df['tokens'].tolist()
            self.labels = df['class'].tolist()
            self.max_len = max_len

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            # No fallback sample: an out-of-range index raises IndexError
            # instead of silently injecting dummy data into training.
            return self.data[idx], self.labels[idx]

        def collate_fn(self, batch):
            """Truncate, tensorise and zero-pad a list of (tokens, label) pairs."""
            data, labels = zip(*batch)
            if self.max_len is not None:
                data = [d[:self.max_len] for d in data]  # Truncate to max_len
            # pad_sequence only accepts tensors, not Python lists.
            data = [torch.tensor(d, dtype=torch.long) for d in data]
            # Padding id 0 is presumably the tokenizer's [PAD] id -- confirm.
            data_padded = pad_sequence(data, batch_first=True, padding_value=0)
            labels = torch.tensor(labels, dtype=torch.long)
            return data_padded, labels

    train_dataset = TextDataset(train_df, max_len=max_len)
    train_loader = DataLoader(train_dataset, batch_size=32, collate_fn=train_dataset.collate_fn)

    # Initialize model, criterion, and optimizer
    model = TransformerClassifier(d_model, h, d_ff, num_layers, num_classes, max_len)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=2e-5)  # Recommended learning rate for fine-tuning BERT

    # Train the model
    trained_model, train_losses = train_model(model, train_loader, criterion, optimizer)

    # Plot training loss
    plt.plot(train_losses)
    plt.xlabel('Epochs')
    plt.ylabel('Training Loss')
    plt.title('Training Loss')
    plt.show()
