# Import

In [1]:
model_d = 512 # If you want to change the model dimension, modify this parameter

In [None]:
# !pip install transformers # Only for Kaggle or Colab

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.functional import log_softmax
import math
import copy
import time
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

In [3]:
def show_example(fn, args=[]):
    if __name__ == "__main__":
        return fn(*args)


def execute_example(fn, args=[]):
    if __name__ == "__main__":
        fn(*args)


class DummyOptimizer(torch.optim.Optimizer):
    def __init__(self):
        self.param_groups = [{"lr": 0}]
        None

    def step(self):
        None

    def zero_grad(self, set_to_none=False):
        None


class DummyScheduler:
    def step(self):
        None

# Structure

In [4]:
class OnlyDecoder(nn.Module):
    """
    A Decoder architecture.
    """

    def __init__(self, decoder, tgt_embed, generator):
        super(OnlyDecoder, self).__init__()
        self.decoder = decoder
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, tgt):
        return self.decode(tgt)

    def decode(self, tgt):
        return self.decoder(self.tgt_embed(tgt))

In [5]:
def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
    
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))

### Attention

In [6]:
def attention(query, key, value, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value):
        nbatches = query.size(0)
        
        # 1) Do all the linear projections in batch from d_model => h x d_k 
        query, key, value = [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) for l, x in zip(self.linears, (query, key, value))]
        
        # 2) Apply attention on all the projected vectors in batch. 
        x, self.attn = attention(query, key, value, dropout=self.dropout)
        
        # 3) "Concat" using a view and apply a final linear. 
        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)

In [7]:
def attention(query, key, value, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

In [8]:
N = 6 # The layer of decoder
class Decoder(nn.Module):
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return self.norm(x)
        
class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
 
    def forward(self, x):
        "Follow Figure 1 (right) for connections."
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x))
        return self.sublayer[1](x, self.feed_forward)


## Positional embedding

In [9]:
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(self.w_1(x).relu()))

In [10]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

In [11]:
class PositionalEncoding(nn.Module):
    "Implement the PE function."

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

## Full model

In [12]:
def make_model_classifier(
    tgt_vocab, label_num, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1
):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = OnlyDecoder(
        Decoder(DecoderLayer(d_model, c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Classifier(d_model, label_num),
    )

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
    
class Classifier(nn.Module):
    "Define standard linear + softmax generation step."

    def __init__(self, d_model, label_num):
        super(Classifier, self).__init__()
        self.proj = nn.Linear(d_model, label_num)

    def forward(self, x):
        return log_softmax(self.proj(x), dim=-1)
        # return self.proj(x)
    


# Training

In [17]:
# Custom Dataset
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = str(self.data.iloc[index, 0])
        label = int(self.data.iloc[index, 1])
        return {
            'text': text,
            'labels': label
        }

# Training function
def train_model(model, train_loader, optimizer, criterion, device, num_epochs=5, model_save_path=f"classify_model_{model_d}.pt"):
    model.to(device)
    model.train()
    epochs = num_epochs
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    epochs_losses = []
    csv_file_path = f'epochs_losses_{model_d}.csv'
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        running_loss = 0.0
        for sample in tqdm(train_loader):
            texts, labels = sample['text'], sample['labels']
            optimizer.zero_grad()

            inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
            input_ids = inputs['input_ids'].to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(input_ids)
            prob = model.generator(outputs[:, -1])
            # Compute loss
            loss = criterion(prob, labels)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        epoch_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch + 1} Loss: {epoch_loss:.4f}")
        epochs_losses.append(epoch_loss)

        # Save model
        torch.save(model.state_dict(), model_save_path)
        print(f'Model saved as {model_save_path}')
        
    # Convert epoch_losses list to a DataFrame
    loss_df = pd.DataFrame({'Epoch': range(1, num_epochs + 1), 'Loss': epochs_losses})

    # Save the DataFrame to a CSV file
    loss_df.to_csv(csv_file_path, index=False)
    print(f"Epoch losses saved to {csv_file_path}")

In [None]:

# Read csv file
df = pd.read_csv('.\\new_merge.csv', header=None, names=['data', 'labels'])
# Use LabelEncoder 
label_encoder = LabelEncoder()
df['labels'] = label_encoder.fit_transform(df['labels'])
label_mapping = dict(zip(range(len(label_encoder.classes_)), label_encoder.classes_))

# Save label mapping as JSON file
json_file_path = 'label_mapping.json'
with open(json_file_path, 'w') as json_file:
    json.dump(label_mapping, json_file)

print(f"Label mapping saved to {json_file_path}")

# Split the dataset
train_df, _ = train_test_split(df, test_size=0.1, random_state=42)

# Init model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
num_classes = df['labels'].nunique()
model = make_model_classifier(tokenizer.vocab_size,num_classes,d_model=model_d)

# Dataloader
train_dataset = CustomDataset(train_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Optimizer and criterion
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Training
train_model(model, train_loader, optimizer, criterion, device= torch.device("cuda:0"), num_epochs=20)


### Test

In [None]:
m_d = 512 # Model's dimension
model_path = os.pardir + f'\\classify_model_{m_d}.pt'

model = make_model_classifier(tokenizer.vocab_size,num_classes,d_model=m_d)

model.load_state_dict(torch.load(model_path))
print(f"d_model: {m_d}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model.eval()  
model.to(device)

In [21]:
# Read json file
json_file_path = f'label_mapping.json'
with open(json_file_path, 'r') as json_file:
    loaded_label_mapping = json.load(json_file)

In [22]:
text = '''Tom Selleck is advocating for the show he’s been a part of for 14 years.
The actor portrays patriarch Frank Reagan on the CBS police drama “Blue Bloods.” In a recent interview with CBS News, Selleck was asked about whether the show was truly canceled.
“Well, that’s a good question,” Selleck responded. “I will continue to think that CBS will come to their senses.”
“We’re the third-highest scripted show in all of broadcast. We’re winning the night,” Selleck added. “All the cast wants to come back. And I can tell you this: we aren’t sliding off down a cliff. We’re doing good shows, and still holding our place. So, I don’t know. You tell me!”
He is not alone. A vibrant online campaign has been commenced to try and save the series about a family of New York City police officers.
Selleck sounds far from ready to retire to his California ranch, which he said he needs to keep working to afford.
“You know, hopefully I keep working enough to hold onto the place,” he said.
Selleck, 79, was asked if finances were really an issue.
“If I stopped working, yeah,” he said. “Am I set for life? Yeah, but maybe not on a 63-acre ranch!”
CNN has reached out to CBS for comment.'''

In [25]:
print(loaded_label_mapping)

{'0': 'business', '1': 'education', '2': 'entertainment', '3': 'labels', '4': 'politics', '5': 'sports', '6': 'technology'}


In [36]:
inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
input_ids = inputs['input_ids'].to(device)

# Forward pass
outputs = model(input_ids)
prob = model.generator(outputs[:, -1])
_, predicted = torch.max(prob, 1)
num_int = int(predicted)
print("Model predicted: ", loaded_label_mapping[str(num_int)])
# print("Model predicted: ", int(predicted))

Model predicted:  entertainment


In [40]:
del model
torch.cuda.empty_cache()