In [7]:
import os

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from torch import nn
from torch.optim import Adam
from tqdm import tqdm
from transformers import BertTokenizer, BertModel





In [13]:
# AG News training split. Column names are supplied explicitly via `names`
# (category, title, text) rather than being taken from the file.
url = (
    "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/"
    "master/data/ag_news_csv/train.csv"
)
c = pd.read_csv(url, names=["category", "title", "text"])


In [22]:

# Numeric class id -> readable category name for the raw `category` column.
# Kept under its own name: the `Dataset` class looks up a module-level
# `labels` dict (name -> class index) at call time, so binding this id -> name
# mapping to `labels` would break `Dataset` whenever this cell is re-run
# after the cell that defines the classifier's `labels`.
category_names = {1: "politics", 2: "sport", 3: "business", 4: "tech"}
c2 = c.replace({"category": category_names})


In [23]:
# Preview the first rows to confirm `category` now holds names instead of numeric ids.
c2.head()

Unnamed: 0,category,title,text
0,business,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,business,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,business,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,business,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,business,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [32]:
# Tokenizer for the same "bert-base-cased" checkpoint that BertClassifier loads,
# so token ids line up with the encoder's vocabulary.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Category name -> integer class index consumed by `Dataset`.
# Five entries, matching the 5-way linear head in BertClassifier; the relabelled
# news frame above only ever produces four of them (there is no "entertainment").
labels = {
    "business": 0,
    "entertainment": 1,
    "sport": 2,
    "tech": 3,
    "politics": 4,
}


class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized article text with integer class ids.

    All work happens eagerly in ``__init__``: every ``category`` value is looked
    up in the module-level ``labels`` dict and every ``text`` value is passed
    through the module-level ``tokenizer``.
    """

    def __init__(self, df):
        """Encode every row of ``df``.

        Args:
            df: Object indexable by ``"category"`` and ``"text"`` whose columns
                are iterable (in this notebook, a pandas DataFrame).

        Raises:
            KeyError: If a category value is not a key of ``labels``.
        """
        # Resolve labels first so an unknown category fails before any
        # tokenization is attempted.
        class_ids = []
        for category in df["category"]:
            class_ids.append(labels[category])
        self.labels = class_ids

        # Every text is padded/truncated to exactly 512 tokens and returned as
        # PyTorch tensors.
        encode_options = dict(
            padding="max_length",
            max_length=512,
            truncation=True,
            return_tensors="pt",
        )
        encodings = []
        for text in df["text"]:
            encodings.append(tokenizer(text, **encode_options))
        self.texts = encodings

    def classes(self):
        """Return the list of integer class ids, one per sample."""
        return self.labels

    def __len__(self):
        """Return the number of samples (length of the label list)."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label stored at ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the stored tokenizer output at ``idx``."""
        selected = self.texts[idx]
        return selected

    def __getitem__(self, idx):
        """Return ``(tokenizer_output, label_array)`` for position ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)
class BertClassifier(nn.Module):
    """Pre-trained BERT encoder followed by a dropout + linear classification head.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
        num_classes: Number of output classes (width of the linear head).
            Defaults to 5, the value that was previously hard-coded.
        pretrained_name: Name or path handed to ``BertModel.from_pretrained``.
            Defaults to ``"bert-base-cased"``.
        final_relu: If True (default, the original behaviour) a ReLU is applied
            to the head's output. NOTE(review): the result is fed to
            ``nn.CrossEntropyLoss`` and ``argmax`` elsewhere in this file, both of
            which work on raw logits; clamping negatives to zero discards
            information and makes every all-negative row tie at class 0.
            Pass ``final_relu=False`` when training a new model. The default is
            kept so existing checkpoints produce the same predictions.
    """

    def __init__(
        self,
        dropout=0.5,
        num_classes=5,
        pretrained_name="bert-base-cased",
        final_relu=True,
    ):
        super().__init__()

        # Pre-trained encoder; fine-tuned together with the head (transfer learning).
        self.bert = BertModel.from_pretrained(pretrained_name)

        # Randomly zeroes a fraction of the pooled features during training.
        self.dropout = nn.Dropout(dropout)

        # Size the head from the encoder's own config rather than hard-coding
        # 768, so a differently sized checkpoint still wires up correctly.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

        # Kept as a submodule even when disabled so the module layout is stable;
        # it holds no parameters and does not appear in the state dict.
        self.relu = nn.ReLU()
        self.final_relu = final_relu

    def forward(self, input_id, mask):
        """Compute class scores for a batch.

        Args:
            input_id: Token id tensor passed to BERT as ``input_ids``.
            mask: Attention mask tensor passed to BERT as ``attention_mask``.

        Returns:
            Tensor of per-class scores from the linear head, passed through
            ReLU when ``final_relu`` is True.
        """
        # With return_dict=False BERT returns a tuple whose first element is the
        # per-token hidden states of the last layer and whose second element is
        # the pooled output; only the pooled output is used here.
        _, pooled_output = self.bert(
            input_ids=input_id, attention_mask=mask, return_dict=False
        )
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)

        if self.final_relu:
            return self.relu(linear_output)
        return linear_output

def _save_checkpoint(path, epoch, model, optimizer, loss):
    """Write model/optimizer state plus bookkeeping values to ``path``."""
    torch.save(
        {
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "loss": loss,
        },
        path,
    )


def _batch_to_device(encoded, label, device):
    """Move one collated batch to ``device`` and drop the singleton tokenizer axis."""
    # The Dataset class in this file tokenizes one text at a time with
    # return_tensors="pt", so collated tensors carry an extra axis at dim 1.
    # Squeeze it from BOTH tensors so ids and mask have matching shapes.
    input_id = encoded["input_ids"].squeeze(1).to(device)
    mask = encoded["attention_mask"].squeeze(1).to(device)
    return input_id, mask, label.to(device)


def train(
    model,
    train_data,
    val_data,
    learning_rate,
    epochs,
    checkpoint=None,
    batch_size=2,
    checkpoint_dir="src/AI/checkpoints",
    save_every=178,
):
    """Fine-tune ``model`` and report train/validation metrics once per epoch.

    Args:
        model: Module called as ``model(input_id, mask)`` returning per-class scores.
        train_data: Frame with ``category`` and ``text`` columns used for training.
        val_data: Frame with the same columns used for validation.
        learning_rate: Learning rate for the Adam optimizer.
        epochs: Number of passes over ``train_data``.
        checkpoint: Optional path to a checkpoint written by this function;
            model and optimizer state are restored from it before training.
        batch_size: Samples per batch for both loaders (default 2, as before).
        checkpoint_dir: Directory checkpoints are written to; created if missing.
        save_every: Write an intermediate checkpoint every this many optimizer
            steps (default 178, as before). A falsy value disables them.

    Side effects:
        Prints one metrics line per epoch and writes checkpoint files into
        ``checkpoint_dir``. Losses are reported as the mean of per-batch mean
        losses; accuracies are per-sample. The ``"loss"`` value stored in a
        checkpoint is the running sum of per-batch losses for the current epoch.
    """
    train_set, val_set = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(
        train_set, batch_size=batch_size, shuffle=True
    )
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)

    # Use the GPU when one is available, otherwise stay on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    global_step = 0
    if checkpoint:
        # NOTE: torch.load unpickles the file; only load checkpoints you trust.
        # map_location lets a GPU-written checkpoint load on a CPU-only machine.
        state = torch.load(checkpoint, map_location=device)
        model.load_state_dict(state["model_state_dict"])
        optimizer.load_state_dict(state["optimizer_state_dict"])
        print("model sucsessfully loaded: \n")

    # torch.save does not create missing parent directories.
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Denominators for the per-epoch report; max(..., 1) avoids dividing by
    # zero when a split is empty.
    n_train = max(len(train_data), 1)
    n_val = max(len(val_data), 1)
    n_train_batches = max(len(train_dataloader), 1)
    n_val_batches = max(len(val_dataloader), 1)

    for epoch_num in range(epochs):
        # Enable dropout for the training pass (validation below disables it).
        model.train()
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            input_id, mask, train_label = _batch_to_device(
                train_input, train_label, device
            )

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label.long())
            total_loss_train += batch_loss.item()
            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            global_step += 1
            if save_every and global_step % save_every == 0:
                _save_checkpoint(
                    os.path.join(checkpoint_dir, f"checkpoint_00_{global_step}.pt"),
                    epoch_num,
                    model,
                    optimizer,
                    total_loss_train,
                )

        # Validation must run deterministically: no dropout, no gradients.
        model.eval()
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                input_id, mask, val_label = _batch_to_device(
                    val_input, val_label, device
                )

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label.long())
                total_loss_val += batch_loss.item()
                total_acc_val += (output.argmax(dim=1) == val_label).sum().item()

        # Each accumulated loss term is already a batch mean, so it is averaged
        # over the number of batches, not the number of samples.
        print(
            f"Epochs: {epoch_num + 1} "
            f"| Train Loss: {total_loss_train / n_train_batches: .3f} "
            f"| Train Accuracy: {total_acc_train / n_train: .3f} "
            f"| Val Loss: {total_loss_val / n_val_batches: .3f} "
            f"| Val Accuracy: {total_acc_val / n_val: .3f}"
        )
        # End-of-epoch checkpoint; the trailing "F" distinguishes it from the
        # intermediate ones.
        _save_checkpoint(
            os.path.join(checkpoint_dir, f"checkpoint_00_{global_step}F.pt"),
            epoch_num,
            model,
            optimizer,
            total_loss_train,
        )

def evaluate(model, test_data, batch_size=2):
    """Measure classification accuracy of ``model`` on ``test_data``.

    Args:
        model: Module called as ``model(input_id, mask)`` returning per-class scores.
        test_data: Frame with ``category`` and ``text`` columns.
        batch_size: Samples per batch (default 2, as before).

    Returns:
        Fraction of correctly classified samples, or ``nan`` when ``test_data``
        is empty. The same value is also printed.
    """
    test_set = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)

    # Use the GPU when one is available, otherwise stay on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Evaluate with dropout disabled regardless of the mode the caller left the
    # model in, and restore that mode afterwards.
    was_training = model.training
    model.eval()

    total_acc_test = 0
    try:
        with torch.no_grad():
            for test_input, test_label in test_dataloader:
                test_label = test_label.to(device)
                # The Dataset class tokenizes one text at a time with
                # return_tensors="pt", so collated tensors carry an extra axis
                # at dim 1; squeeze it from both the ids and the mask.
                mask = test_input["attention_mask"].squeeze(1).to(device)
                input_id = test_input["input_ids"].squeeze(1).to(device)

                output = model(input_id, mask)

                total_acc_test += (output.argmax(dim=1) == test_label).sum().item()
    finally:
        model.train(was_training)

    n_samples = len(test_data)
    accuracy = total_acc_test / n_samples if n_samples else float("nan")
    print(f"Test Accuracy: {accuracy: .3f}")
    return accuracy

In [33]:
np.random.seed(112)
df = c2.copy()
df = df[:1000]

# Shuffle once (fixed seed), then cut 80/10/10 by position. Positional slicing
# yields the same three frames as np.split(frame, [train_end, val_end]) while
# avoiding the FutureWarning newer pandas emits when np.split is applied to a
# DataFrame.
df_shuffled = df.sample(frac=1, random_state=42)
train_end, val_end = int(0.8 * len(df)), int(0.9 * len(df))
df_train = df_shuffled.iloc[:train_end]
df_val = df_shuffled.iloc[train_end:val_end]
df_test = df_shuffled.iloc[val_end:]

m = BertClassifier()

# The default is the original author's local path; set NER_MODEL_PATH to point
# at the weights file on any other machine.
model_path = os.environ.get(
    "NER_MODEL_PATH",
    "/Users/iansnyder/Desktop/Projects/NER_proj/src/AI/models/model4.pt",
)
# NOTE: torch.load unpickles the file; only load weights you trust.
# map_location="cpu" lets GPU-saved weights load on a machine without CUDA;
# evaluate() moves the model to the GPU itself when one is available.
state_dict = torch.load(model_path, map_location="cpu")
m.load_state_dict(state_dict)
m.eval()
evaluate(m, df_test)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.500
