# News Source Attribution With a Convolutional Neural Network

In [None]:
# Import necessary libraries

import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk

# Fetch NLTK's Punkt tokenizer data
nltk.download('punkt')

# Load the cleaned news articles into a DataFrame
data = pd.read_csv("../../../data/all-the-news-2-1-SMALL-CLEANED.csv")

# Partition the rows using the precomputed 'split' column
train_data = data.loc[data['split'] == 'train']
test_data = data.loc[data['split'] == 'test']

# Pull out the article text (model input) and the publication name (target)
train_texts, train_labels = (
    train_data[column].tolist() for column in ('clean_article', 'publication')
)
test_texts, test_labels = (
    test_data[column].tolist() for column in ('clean_article', 'publication')
)

# Turn publication names into integer class ids.
# The encoder is fitted on the training labels only and then reused for the test labels.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
test_labels = label_encoder.transform(test_labels)

# Count the distinct publications (classes) seen during fitting
num_labels = len(label_encoder.classes_)

def tokenize_texts(texts):
    """Lowercase each text and split it into tokens with ``word_tokenize``.

    Args:
        texts: Iterable of article strings.

    Returns:
        A list with one list of tokens per input text, in input order.
    """
    tokenized = []
    for text in texts:
        lowered = text.lower()
        tokenized.append(word_tokenize(lowered))
    return tokenized

# Tokenize every article: one list of tokens per article
train_tokens = tokenize_texts(train_texts)
test_tokens = tokenize_texts(test_texts)

# Tally token occurrences over the training articles only
vocab = Counter(token for article in train_tokens for token in article)

# Replace counts with a 1-based frequency rank (1 = most common token).
# Rank 0 is left unused here so it can stand for "not in the vocabulary".
vocab = {
    word: rank
    for rank, (word, _count) in enumerate(vocab.most_common(), start=1)
}

def tokens_to_indices(tokens, vocab):
    """Translate tokenized texts into lists of vocabulary indices.

    Args:
        tokens: Iterable of token lists, one per text.
        vocab: Mapping from token to its integer index.

    Returns:
        A list of lists of indices, parallel to ``tokens``. Tokens missing
        from ``vocab`` are mapped to 0.
    """
    lookup = vocab.get
    indexed = []
    for text in tokens:
        indexed.append([lookup(token, 0) for token in text])
    return indexed

# Convert each article's tokens into their vocabulary ranks.
# Each result is a variable-length list of integers (one per token); tokens
# that never appeared in the training data become 0.
train_indices, test_indices = (
    tokens_to_indices(split_tokens, vocab)
    for split_tokens in (train_tokens, test_tokens)
)

### Padding
At this point, each article is a list of token indices whose length equals the article's token count, so the lengths vary from article to article. However, batching requires tensors of uniform shape, and our 1D CNN ends in a fully connected layer that expects a fixed input size.

We will use padding to ensure that our input sequences all have the same length.

In [None]:
def pad(sequences, max_len):
    """Pad or truncate index sequences to a common length.

    Each sequence is cut to its first ``max_len`` entries; sequences shorter
    than ``max_len`` are right-padded with 0.

    Args:
        sequences: Iterable of lists of integer token indices.
        max_len: Target length of every row; must be non-negative.

    Returns:
        A ``torch.long`` tensor of shape ``(number of sequences, max_len)``.

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")
    # Slicing handles truncation; a negative repeat count yields [], so
    # sequences that are already long enough receive no padding.
    rows = [list(seq[:max_len]) + [0] * (max_len - len(seq)) for seq in sequences]
    # The explicit dtype and reshape keep the result a 2D integer tensor even
    # when there are no rows (torch.tensor([]) alone is a 1D float tensor).
    return torch.tensor(rows, dtype=torch.long).reshape(len(rows), max_len)

# Fix every article at max_len token indices (truncate or zero-pad)
max_len = 500
train_padded, test_padded = (
    pad(split_indices, max_len)
    for split_indices in (train_indices, test_indices)
)

In [None]:
# Prepare data for PyTorch
class NewsDataset(Dataset):
    """Map-style dataset pairing token-index sequences with class labels.

    Args:
        texts: Indexable collection with one token-index sequence per
            article (for example the 2D tensor returned by ``pad``).
        labels: Indexable collection of integer class labels, one per
            article, in the same order as ``texts``.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of articles in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` tensor pair stored at position ``idx``.

        When ``texts`` is already a tensor, the returned text shares memory
        with it rather than being a fresh copy.
        """
        # as_tensor reuses an existing tensor instead of copy-constructing it,
        # which torch.tensor() would do (with a UserWarning) on every access.
        text = torch.as_tensor(self.texts[idx])
        # Force int64 so labels are valid class-index targets even when the
        # source array uses a narrower integer type (e.g. numpy int32).
        label = torch.as_tensor(self.labels[idx], dtype=torch.long)
        return text, label

# Number of articles per mini-batch
batch_size = 64

# Wrap the padded index tensors and encoded labels for each split
train_dataset = NewsDataset(train_padded, train_labels)
test_dataset = NewsDataset(test_padded, test_labels)

# Batch iterators: training batches are reshuffled, test batches keep dataset order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Create a class for our 1D CNN model with an embedding layer
class CNNClassifier(nn.Module):
    """1D CNN text classifier: embedding -> conv -> ReLU -> max-pool -> linear.

    Args:
        vocab_size: Number of rows in the embedding table; every input index
            must be smaller than this.
        embed_dim: Size of each token embedding.
        num_classes: Number of output logits.
        seq_len: Fixed length of every input sequence. It determines the
            input size of the final linear layer. When ``None`` (the
            default), the module-level ``max_len`` is used, matching the
            original behavior.
        num_filters: Number of convolution output channels.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, seq_len=None, num_filters=128):
        super().__init__()
        if seq_len is None:
            # Backward-compatible fallback to the notebook-level padding length.
            seq_len = max_len
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # kernel_size=5 with padding=2 and stride=1 keeps the sequence length.
        self.conv1 = nn.Conv1d(embed_dim, num_filters, kernel_size=5, stride=1, padding=2)
        self.relu = nn.ReLU()
        # Pooling with kernel_size=2 halves the length (rounding down).
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.fc = nn.Linear(num_filters * (seq_len // 2), num_classes)

    def forward(self, x):
        """Compute class logits for a batch of token-index sequences.

        Args:
            x: Integer tensor of shape ``(batch_size, seq_len)``.

        Returns:
            Tensor of shape ``(batch_size, num_classes)`` with unnormalized
            scores.
        """
        # Conv1d expects channels first: (batch_size, embed_dim, seq_len)
        x = self.embedding(x).permute(0, 2, 1)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)  # Flatten to (batch_size, num_filters * seq_len // 2)
        x = self.fc(x)
        return x

# Build the classifier
embed_dim = 100
vocab_size = 1 + len(vocab)  # vocabulary ranks start at 1; the extra slot is index 0 (padding)
model = CNNClassifier(vocab_size=vocab_size, embed_dim=embed_dim, num_classes=num_labels)