In [182]:
#%pip install torch torchvision torchaudio
import pandas as pd
from transformers import BertTokenizer
from transformers import BertModel
import torch
from torch import nn
import numpy as np


from torch.optim import Adam
from tqdm import tqdm

In [183]:
json_file_path = 'News_Category_Dataset_IS_course.json'

with open(json_file_path, 'r') as file:
    lines = file.readlines()

json_data = '[' + ','.join(lines) + ']'
df = pd.read_json(json_data)
df = df[["short_description", "category"]]
df = df.rename(columns={"short_description": "text", "category": "label"})
df = df.loc[:999, :]

unique_categories = df["label"].unique()
# unique_categories = ['COMEDY' 'PARENTING' 'SPORTS' 'ENTERTAINMENT' 'POLITICS' 'WELLNESS'
#  'BUSINESS' 'STYLE & BEAUTY' 'FOOD & DRINK' 'QUEER VOICES' 'HOME & LIVING'
#  'BLACK VOICES' 'TRAVEL' 'PARENTS' 'HEALTHY LIVING']

category_mapping = {}
for index, category in enumerate(unique_categories):
    category_mapping[category] = index
    
#print(category_mapping)

print(df.head())

  df = pd.read_json(json_data)


                                                text          label
0  "Until you have a dog you don't understand wha...         COMEDY
1  "Accidentally put grown-up toothpaste on my to...      PARENTING
2  Maury Wills, who helped the Los Angeles Dodger...         SPORTS
3  For the past 18 months, Hollywood has effectiv...  ENTERTAINMENT
4  President issues vow as tensions with China rise.       POLITICS


In [184]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

example_text = 'I will watch Memento tonight'
bert_input = tokenizer(example_text,padding='max_length', max_length = 10, 
                       truncation=True, return_tensors="pt")


print(bert_input['input_ids'])
print(bert_input['token_type_ids'])
print(bert_input['attention_mask'])

tensor([[  101,   146,  1209,  2824,  2508, 26173,  3568,   102,     0,     0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])


In [185]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
labels = category_mapping

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):
        self.labels = [labels[label] for label in df['label']]
        self.texts = [tokenizer(
            text = "asd",
            padding='max_length',
            max_length=512,
            truncation=True,
            return_tensors="pt"
        ) for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [186]:
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

800 100 100


  return bound(*args, **kwds)


In [187]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, len(unique_categories))
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [188]:
def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
              
train(model, df_train, df_val, LR, EPOCHS)

  3%|▎         | 11/400 [00:12<07:10,  1.11s/it]


KeyboardInterrupt: 

In [None]:

def evaluate(model, test_data):

    test = Dataset(test_data)
    
    outputs = []

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)
              outputs.append(output)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    
    return outputs
    
evaluate(model, df_test)

Test Accuracy:  0.470


[tensor([[0.0828, 0.0000, 1.5016, 2.7696, 3.1907, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 1.1058, 2.9067, 3.2508, 0.0000, 0.2882, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000]]),
 tensor([[0.0000, 0.0000, 1.7417, 3.0053, 3.8917, 0.0000, 0.4116, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000],
         [0.0754, 0.0000, 1.7596, 2.9623, 3.5517, 0.0000, 1.0431, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000]]),
 tensor([[0.0000, 0.0000, 1.5059, 2.8508, 3.1585, 0.0000, 0.6953, 0.0400, 0.0000,
          0.0000, 0.0000, 0.0000, 0.2372],
         [0.0000, 0.0000, 1.8772, 2.9558, 3.9754, 0.0000, 0.2055, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000]]),
 tensor([[0.5535, 0.0000, 1.2274, 2.5284, 3.4695, 0.0000, 0.5903, 0.0696, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 1.7958, 3.2971, 3.0503, 0.0000, 0.5243, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.3

In [None]:
data = {
    'text': ["Don't you just love it when you hear the words: fun, laughter and park"],
    'label': ["ENTERTAINMENT"]
}

df = pd.DataFrame(data)

result = evaluate(model, df)
print(result)

Test Accuracy:  0.000
[tensor([[0.0427, 0.0000, 2.3858, 2.5399, 3.6643, 0.0000, 0.4824, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.2163]])]
