In [34]:
from datasets import load_dataset
from torch.utils.data import random_split
import torch
from collections import Counter
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

In [26]:

ds = load_dataset("ucirvine/sms_spam")

In [27]:
def create_vocabulary(texts, max_words=10000):
    # Flatten all words and count them
    all_words = ' '.join(texts).lower().split()
    word_counts = Counter(all_words)
    # Get most common words
    vocab = ['<PAD>', '<UNK>'] + [word for word, _ in word_counts.most_common(max_words-2)]
    # Create word to index mapping
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    return word2idx


In [66]:
class SMSDataset(Dataset):
    def __init__(self, dataset, word2idx, max_length=50):
        self.dataset = dataset  # Keep the whole dataset
        self.word2idx = word2idx
        self.max_length = max_length
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        # Access the dataset directly
        item = self.dataset[idx]
        text = str(item['sms']).lower().split()
        
        # Convert words to indices
        indices = [self.word2idx.get(word, self.word2idx['<UNK>']) for word in text]
        
        # Pad or truncate
        if len(indices) < self.max_length:
            indices += [self.word2idx['<PAD>']] * (self.max_length - len(indices))
        else:
            indices = indices[:self.max_length]
            
        return {
            'text': torch.tensor(indices, dtype=torch.long),
            'label': torch.tensor(item['label'], dtype=torch.long)
        }

In [67]:
# 3. LSTM Model
class SMSClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=64, n_layers=2, dropout=0.2):
        super(SMSClassifier, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           n_layers, 
                           batch_first=True, 
                           dropout=dropout if n_layers > 1 else 0,
                           bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        # Multiply hidden_dim by 2 for bidirectional
        self.fc = nn.Linear(hidden_dim * 2, 2)
        
    def forward(self, x):
        # x shape: (batch_size, sequence_length)
        embedded = self.embedding(x)
        # embedded shape: (batch_size, sequence_length, embedding_dim)
        
        lstm_out, (hidden, cell) = self.lstm(embedded)
        # Use the final hidden state from both directions
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        out = self.dropout(hidden)
        return self.fc(out)

In [68]:
def train_model(model, train_loader, criterion, optimizer, device, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        correct = 0
        total = 0
        
        for batch in train_loader:
            print(batch['sms'])
            texts = batch['sms'].to(device)
            labels = batch['label'].to(device)
            
            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        
        epoch_loss = total_loss / len(train_loader)
        epoch_acc = correct / total
        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')

In [69]:
full_dataset = ds['train']
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size

train_dataset, test_dataset = random_split(
    full_dataset, 
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42)
)

# Create vocabulary
word2idx = create_vocabulary([train_dataset[i]['sms'] for i in range(len(train_dataset))])
vocab_size = len(word2idx)

In [70]:
print(word2idx)



In [71]:
# Create SMS datasets
train_data = SMSDataset(
    [train_dataset[i]['sms'] for i in range(len(train_dataset))],
    [train_dataset[i]['label'] for i in range(len(train_dataset))],
    word2idx
)

test_data = SMSDataset(
    [test_dataset[i]['sms'] for i in range(len(test_dataset))],
    [test_dataset[i]['label'] for i in range(len(test_dataset))],
    word2idx
)

In [72]:
# Create dataloaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [73]:
# Initialize model, loss function, and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SMSClassifier(vocab_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

In [74]:
for x in train_dataset:
    print(x)

{'sms': 'Then what about further plan?\n', 'label': 0}
{'sms': 'My sister in law, hope you are having a great month. Just saying hey. Abiola\n', 'label': 0}
{'sms': 'All these nice new shirts and the only thing I can wear them to is nudist themed ;_; you in mu?\n', 'label': 0}
{'sms': "Lol now I'm after that hot air balloon!\n", 'label': 0}
{'sms': 'Are you sure you don\'t mean "get here, we made you hold all the weed"\n', 'label': 0}
{'sms': 'May i call You later Pls\n', 'label': 0}
{'sms': 'Here got ur favorite oyster... N got my favorite sashimi... Ok lar i dun say already... Wait ur stomach start rumbling...\n', 'label': 0}
{'sms': 'Awww dat is sweet! We can think of something to do he he! Have a nice time tonight ill probably txt u later cos im lonely :( xxx.\n', 'label': 0}
{'sms': 'Will be office around 4 pm. Now i am going hospital.\n', 'label': 0}
{'sms': 'Hey , is * rite u put »10 evey mnth is that all?\n', 'label': 0}
{'sms': 'URGENT! Your mobile number *************** WON a