In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import random

In [None]:
# Dataset Class
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, vocab, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokens = [self.vocab.get(token, self.vocab['<UNK>']) for token in self.tokenizer(text)]
        tokens = tokens[:self.max_len] + [self.vocab['<PAD>']] * (self.max_len - len(tokens))
        return torch.tensor(tokens, dtype=torch.long), torch.tensor(label, dtype=torch.long) - 1  # Adjust labels to 0-based index

# Vocabulary Building Function
def build_vocab(texts, tokenizer, max_size=10000):
    frequency = {}
    for text in texts:
        for token in tokenizer(text):
            if token not in frequency:
                frequency[token] = 1
            else:
                frequency[token] += 1
    vocab = {word: i + 2 for i, word in enumerate(sorted(frequency, key=frequency.get, reverse=True)[:max_size])}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab

# LSTM Classifier
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, _) = self.lstm(embedded)
        hidden = hidden[-1]
        out = self.fc(hidden)
        return out

# Training and Evaluation Functions
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    model.train()
    for texts, labels in iterator:
        optimizer.zero_grad()
        predictions = model(texts)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for texts, labels in iterator:
            predictions = model(texts)
            loss = criterion(predictions, labels)
            acc = accuracy_score(labels.numpy(), predictions.argmax(dim=1).numpy())
            epoch_loss += loss.item()
            epoch_acc += acc
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Load data
train_df = pd.read_csv('data/fulltrain.csv', names=['label', 'text'])
_, sample_df = train_test_split(train_df, test_size=0.1, stratify=train_df['label'], random_state=42)
# Use only the first 1/4 of the training data
# train_df = train_df[:len(train_df) // 4]
test_df = pd.read_csv('data/balancedtest.csv', names=['label', 'text'])

In [None]:
sample_df['label'].value_counts()

In [None]:
# Main
tokenizer = word_tokenize
vocab = build_vocab(pd.concat([sample_df['text'], test_df['text']]), tokenizer)
train_dataset = NewsDataset(sample_df['text'].tolist(), sample_df['label'].tolist(), tokenizer, vocab, max_len=512)
test_dataset = NewsDataset(test_df['text'].tolist(), test_df['label'].tolist(), tokenizer, vocab, max_len=512)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

model = LSTMClassifier(vocab_size=len(vocab), embedding_dim=100, hidden_dim=128, output_dim=4, n_layers=2, dropout=0.5)
# optimizer = optim.Adam(model.parameters(), lr=0.001)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
criterion = nn.CrossEntropyLoss()

In [None]:
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_loader, criterion)
    scheduler.step()
    print(f'Epoch: {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {test_loss:.4f}, Val Acc: {test_acc:.4f}')

# New

In [52]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
from sklearn.model_selection import train_test_split

class CustomModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, max_length):
        super(CustomModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.conv1d = nn.Sequential(
            nn.Conv1d(embedding_dim, 16, kernel_size=5), 
            nn.ReLU(),
            nn.Dropout(0.5)
            )
        self.global_maxpooling = nn.AdaptiveMaxPool1d(1)
        self.dense = nn.Linear(16, 4)
    
    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = self.conv1d(x)
        x = self.global_maxpooling(x)
        x = self.dense(x.squeeze(2))
        return x

def build_vocab(sequences, vocab_size, oov_token):
    word_to_idx = {"<PAD>": 0, oov_token: 1}
    idx = 2
    word_count = {}
    for sequence in sequences:
        for token in sequence:
            if token not in word_count:
                word_count[token] = 1
            else:
                word_count[token] += 1
    sorted_words = sorted(word_count.keys(), key=lambda x: word_count[x], reverse=True)
    for word in sorted_words[:vocab_size - 2]:
        word_to_idx[word] = idx
        idx += 1
    return word_to_idx

def generate_indices(sequences, word_to_idx):
    oov_tok = "<OOV>"
    return [[word_to_idx.get(token, word_to_idx[oov_tok]) for token in sequence] for sequence in sequences]

def sequences_to_tensors(sequences):
    tensor_sequences = [torch.tensor(sequence) for sequence in sequences]
    padded_sequences = pad_sequence(tensor_sequences, batch_first=True, padding_value=0)
    return padded_sequences

print("Reading data...")
# Data Loading and Preprocessing
train_df = pd.read_csv('data/fulltrain.csv', names=['label', 'title_text'])
test_df = pd.read_csv('data/balancedtest.csv', names=['label', 'title_text'])
# test_df = pd.read_csv('data/fulltrain.csv', names=['label', 'title_text'])
# train_df = pd.read_csv('data/balancedtest.csv', names=['label', 'title_text'])

print("Original training dataset shape: ", train_df.shape)

# Reduce train dataset and maintain label distribution
sample_size = 2500
train_df = train_df.groupby('label').apply(lambda x: x.sample(sample_size)).reset_index(drop=True)
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'])
# test_df, _ = train_test_split(test_df, test_size=0.88, stratify=test_df['label'])
# test_df, val_df = train_test_split(test_df, test_size=0.16, stratify=test_df['label'])

train_df['label'] -= 1  # Zero-index labels
test_df['label'] -= 1
val_df['label'] -= 1

print("train_df value counts: ", train_df['label'].value_counts())
print("test__df value counts: ", test_df['label'].value_counts())
print("val___df value counts: ", val_df['label'].value_counts())

print(train_df.head())
print(test_df.head())
print(val_df.head())

print("Tokenising...")
tokenizer = get_tokenizer('basic_english')
train_sequences = [tokenizer(sample) for sample in train_df['title_text']]
test_sequences = [tokenizer(sample) for sample in test_df['title_text']]
val_sequences = [tokenizer(sample) for sample in val_df['title_text']]
print("\n")

print("Building vocab...")
word_to_idx = build_vocab(train_sequences, 10000, "<OOV>")
train_indices = generate_indices(train_sequences, word_to_idx)
test_indices = generate_indices(test_sequences, word_to_idx)
val_indices = generate_indices(val_sequences, word_to_idx)
print("\n")

train_padded = sequences_to_tensors(train_indices)
test_padded = sequences_to_tensors(test_indices)
val_padded = sequences_to_tensors(val_indices)

y_train = torch.tensor(train_df['label'].values).long()
y_test = torch.tensor(test_df['label'].values).long()
y_val = torch.tensor(val_df['label'].values)

train_dataset = TensorDataset(train_padded, y_train)
test_dataset = TensorDataset(test_padded, y_test)
val_dataset = TensorDataset(val_padded, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Model Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CustomModel(10000, 155, train_padded.shape[1])
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
print("Training...")
num_epochs = 7
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for batch_inputs, batch_labels in train_loader:
        batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()
    
    # Validation loss calculation
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for data, target in val_loader:
            output = model(data)
            loss = criterion(output, target)
            val_loss += loss.item()

    train_accuracy = 100 * correct / total
    val_loss /= len(val_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Train loss: {total_loss / len(train_loader):.4f}, Train accuracy: {train_accuracy:.2f}%, Val Loss: {val_loss:.4f}')

# Evaluation on Test Data
def evaluate_model(model, loader):
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total_samples += labels.size(0)
            total_correct += (predicted == labels).sum().item()
    accuracy = 100 * total_correct / total_samples
    return accuracy

test_accuracy = evaluate_model(model, test_loader)
print(f'Test Accuracy: {test_accuracy:.2f}%')

Reading data...
Original training dataset shape:  (48854, 2)
train_df value counts:  label
1    2000
0    2000
2    2000
3    2000
Name: count, dtype: int64
test__df value counts:  label
0    750
1    750
2    750
3    750
Name: count, dtype: int64
val___df value counts:  label
0    500
3    500
2    500
1    500
Name: count, dtype: int64
      label                                         title_text
4712      1  Obama Tries To Smear Bill OReilly, Big Mistake...
2750      1  Kurt Russell Tells Public He Is A Libertarian ...
2952      1  What Obama Said About ISIS Before The Paris At...
2225      0  As part of an investigation into possible rule...
1110      0  Sources close to Tiger Woods confirmed Friday ...
   label                                         title_text
0      0  When so many actors seem content to churn out ...
1      0   In what football insiders are calling an unex...
2      0  In a freak accident following Game 3 of the N....
3      0  North Koreas official news agen

  train_df = train_df.groupby('label').apply(lambda x: x.sample(sample_size)).reset_index(drop=True)




Building vocab...


Training...
Epoch 1/7, Train loss: 0.7121, Train accuracy: 71.35%, Val Loss: 0.5022
Epoch 2/7, Train loss: 0.3832, Train accuracy: 86.20%, Val Loss: 0.3907
Epoch 3/7, Train loss: 0.2864, Train accuracy: 89.91%, Val Loss: 0.3190
Epoch 4/7, Train loss: 0.2217, Train accuracy: 92.33%, Val Loss: 0.2937
Epoch 5/7, Train loss: 0.1921, Train accuracy: 93.70%, Val Loss: 0.2666
Epoch 6/7, Train loss: 0.1544, Train accuracy: 94.86%, Val Loss: 0.2468
Epoch 7/7, Train loss: 0.1302, Train accuracy: 95.65%, Val Loss: 0.2229
Test Accuracy: 38.00%
