## CS570 Project

In [2]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
import torch

2024-04-26 15:44:42.695672: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Fetch the 20 Newsgroups corpus (subset='all'), shuffled with a fixed seed
# so the document order is the same on every run.
data = fetch_20newsgroups(
    subset='all',
    random_state=42,
    shuffle=True,
)

In [4]:
# Separate the raw documents from their integer class ids; the ids are
# stored as an int64 tensor, the dtype PyTorch loss functions expect for targets.
texts = data.data
labels = torch.tensor(data.target, dtype=torch.long)


In [5]:
# Build a word-level vocabulary limited by num_words=500 and turn every
# document into a list of integer token ids; words outside the vocabulary
# are replaced by the '<OOV>' token.
# NOTE(review): Config.vocab_size is 5000 while num_words is 500 -- confirm which one is intended.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=500)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [7]:
# Target sequence length = the rounded *mean* number of tokens per document
# (not the maximum); longer documents are truncated to it, shorter ones padded.
maxlen = round(np.mean(list(map(len, sequences))))

In [8]:
# Pad / truncate every sequence at its end to exactly `maxlen` tokens and
# pack the result into a single int64 tensor.
padded = torch.LongTensor(
    pad_sequences(
        sequences,
        maxlen=maxlen,
        padding='post',
        truncating='post',
    )
)


In [9]:
# Hold out 20% of the documents as the test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    padded,
    labels,
    random_state=42,
    test_size=0.2,
)


In [10]:
# Carve a further 10% of the remaining training data off as a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    random_state=42,
    test_size=0.1,
)

In [11]:
class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super(MultiHeadAttention, self).__init__()
        assert config.n_embd % config.n_head == 0
        
        self.config = config
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # Embedding layer for tokens
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)

        # Positional embedding layer
        self.wpe = nn.Embedding(config.block_size, config.n_embd)

        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)

        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
                
        #layer_normalization
        self.ln_1 = nn.LayerNorm(config.n_embd, bias=config.bias)
        self.ln_2 = nn.LayerNorm(config.n_embd, bias=config.bias)

        self.drop = nn.Dropout(config.dropout)

        
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_mlp = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)

        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

     

        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                              .view(1, 1, config.block_size, config.block_size))

    def forward(self, idx):
        
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device)  # shape (t)

        # Forward the model itself
        tok_emb = self.wte(idx)  # token embeddings of shape (b, t, n_embd)
        pos_emb = self.wpe(pos)  # position embeddings of shape (t, n_embd)
        x = self.drop(tok_emb + pos_emb)
        
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)

        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        
     
        x = self.ln_1(x + y)
        m = self.c_fc(x)
        m = self.gelu(m)
        m = self.c_mlp(m)
        m = self.resid_dropout(m)
        x = self.ln_2(x + m)

        return x

In [12]:
class Config:
    """Hyperparameters shared by the transformer block and the classifier.

    Every setting is a keyword argument whose default is the value used in
    this notebook, so ``Config()`` behaves exactly as before while single
    values can be overridden, e.g. ``Config(n_head=8)``.

    Args:
        vocab_size: number of rows in the token embedding table; must be
            larger than the largest token id fed to the model.
        n_embd: embedding / hidden dimension.
        n_head: number of attention heads; must divide ``n_embd``.
        block_size: longest sequence the model accepts (size of the position
            embedding table and of the causal mask).
        dropout: dropout probability used throughout the model.
        bias: whether Linear and LayerNorm layers get a bias term.

    Raises:
        ValueError: if ``n_embd`` is not divisible by ``n_head``.
    """

    def __init__(self, vocab_size=5000, n_embd=128, n_head=4, block_size=314,
                 dropout=0.1, bias=False):
        # Fail early here rather than deep inside model construction.
        if n_embd % n_head != 0:
            raise ValueError(
                f"n_embd ({n_embd}) must be divisible by n_head ({n_head})"
            )
        self.vocab_size = vocab_size  # example vocabulary size
        self.n_embd = n_embd  # embedding dimension
        self.n_head = n_head  # number of heads
        self.block_size = block_size  # sequence length
        self.dropout = dropout  # dropout rate
        self.bias = bias  # whether to use bias in linear layers

config = Config()

In [13]:
class TextClassifier(nn.Module):
    """Sequence classifier: transformer block -> mean pooling -> linear head.

    Args:
        config: hyperparameter object; ``config.n_embd`` and
            ``config.dropout`` are read here and the whole object is passed
            on to ``MultiHeadAttention``.
        num_classes: number of output classes. Defaults to 20, the number of
            newsgroups, so existing ``TextClassifier(config)`` calls are
            unaffected.
    """

    def __init__(self, config, num_classes=20):
        super().__init__()
        self.transformer = MultiHeadAttention(config)
        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        self.dropout = nn.Dropout(config.dropout)
        self.fc1 = nn.Linear(config.n_embd, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of token ids.

        Args:
            x: integer tensor of token ids, shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, num_classes) holding log-probabilities
            (each row sums to 1 after ``exp``). Because the output is already
            log-softmaxed, ``nn.NLLLoss`` is the natural loss;
            ``nn.CrossEntropyLoss`` yields the same value since applying
            log-softmax a second time changes nothing.
        """
        encoded = self.transformer(x)  # (batch, seq_len, n_embd)
        # AdaptiveAvgPool1d pools over the last axis, so move the sequence
        # axis last, average it away, and drop the leftover length-1 axis.
        pooled = self.avg_pool(encoded.permute(0, 2, 1)).squeeze(-1)
        pooled = self.dropout(pooled)
        scores = self.fc1(pooled)
        return self.softmax(scores)


In [14]:
# Seed PyTorch before building the model so the random weight initialisation
# (and later dropout masks) are repeatable after a kernel restart.
torch.manual_seed(42)

config = Config()
multihead_attn = TextClassifier(config)

In [15]:
# Classification loss and an Adam optimiser over every model parameter.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=multihead_attn.parameters(), lr=1e-3)

In [16]:
# Train with shuffled mini-batches.
# Fixes relative to the earlier version of this cell:
#   * targets come from y_train (the old code passed `labels`, i.e. the
#     whole dataset, which does not match the number of training outputs);
#   * the training set is no longer pushed through the model in one forward
#     pass, which would need an attention tensor of n_train x heads x T x T;
#   * the loss object defined above (`criterion`) is actually used.
epochs = 10
batch_size = 64
n_train = X_train.size(0)

for epoch in range(epochs):
    multihead_attn.train()  # Set model to training mode
    total_loss = 0.0

    # New random sample order every epoch.
    order = torch.randperm(n_train)
    for start in range(0, n_train, batch_size):
        batch_idx = order[start:start + batch_size]
        batch_x = X_train[batch_idx]
        batch_y = y_train[batch_idx]

        optimizer.zero_grad()  # Zero the gradients
        outputs = multihead_attn(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        # Weight by batch size so the (possibly smaller) last batch is
        # averaged correctly.
        total_loss += loss.item() * batch_x.size(0)

    # Mean per-sample training loss for this epoch.
    print(" The loss is :", total_loss / n_train)


    


: 