# News Source Attribution With a Convolutional Neural Network

In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk
import gdown

nltk.download('punkt')
nltk.download('punkt_tab')

# File ID and destination
file_id = "1QoqGS6XE8BONzPvUQCVuGjaBdt_GaXlz"
output = "all-the-news-2-1-SMALL-CLEANED.csv"

# Download from Google Drive
#gdown.download(id=file_id, output=output, quiet=False)
data = pd.read_csv("../../../data/all-the-news-2-1-SMALL-CLEANED.csv")

'''
# Read in the data
data = pd.read_csv("../../../data/all-the-news-2-1-SMALL-CLEANED.csv")
'''

# Split into training and testing data
train_data = data[data['split'] == 'train']
test_data = data[data['split'] == 'test']

# Extract text and labels
train_texts = train_data['clean_article'].tolist()
train_labels = train_data['publication'].tolist()
test_texts = test_data['clean_article'].tolist()
test_labels = test_data['publication'].tolist()

# Encode labels
# Use sklearn's LabelEncoder to convert the news sources into numeric values
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
test_labels = label_encoder.transform(test_labels)

# Get the number of unique labels (classes)
num_labels = len(label_encoder.classes_)

# Tokenize the text
# Returns a list of lists of tokens
def tokenize_texts(texts):
    return [word_tokenize(text.lower()) for text in texts]

# Lists of lists of tokens for each article
train_tokens = tokenize_texts(train_texts)
test_tokens = tokenize_texts(test_texts)

# Get the count of each token in the training data
vocab = Counter(token for tokens in train_tokens for token in tokens)

# Store the "frequency ranking" for each token
vocab = {word: idx + 1 for idx, (word, _) in enumerate(vocab.most_common())}

# Convert tokens to indices
# Returns a list of lists of the "frequency ranking" of each token in each article
def tokens_to_indices(tokens, vocab):
    return [[vocab.get(token, 0) for token in text] for text in tokens]

# Lists of lists of tokens as "frequency rankings" for each article
# These can be interpreted as dense vectors, with few zeros, where each vector entry is the frequency ranking for the corresponding token
train_indices = tokens_to_indices(train_tokens, vocab)
test_indices = tokens_to_indices(test_tokens, vocab)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gpete\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\gpete\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### Padding
At this point, the lengths of our dense article vectors are directly proportional to the article's word count. However, neural networks (including 1D CNNs) require fixed input sizes.

We will use padding to ensure that our input sequences all have the same length.

In [6]:
# This function takes a list of lists (in other words, a list of dense vectors whose values correspond to respective tokens' frequency rankings)
# We will reasonably shorten these vectors to have [max_len] elements
# If a vector is shorter than [max_len], it is safe to append 0's to the end of the vector
# Otherwise, we truncate it to the first [max_len] entries
# Returns a PyTorch tensor, still a 2D matrix, or a list of vectors, but now with the vectors sharing a consistent length
def pad(sequences, max_len):
    return torch.tensor([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in sequences])

# Store padded vectors
max_len = 500
train_padded = pad(train_indices, max_len)
test_padded = pad(test_indices, max_len)

In [8]:
import time

# uses cuda GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare data for PyTorch
class NewsDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        first_item = torch.tensor(self.texts[idx]).to(device)
        second_item = torch.tensor(self.labels[idx]).to(device)
        return first_item, second_item

# Create datasets
train_dataset = NewsDataset(train_padded, train_labels)
test_dataset = NewsDataset(test_padded, test_labels)

# Create dataloaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Create a class for our 1D CNN model with an embedding layer
# Use a PyTorch nn.Module to define the architecture of the 1D CNN model
class CNNClassifier(nn.Module):

    # This function initializes the layers of the model
    def __init__(self, vocab_size, embed_dim, num_labels):
        super(CNNClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)  # Embedding layer
        self.conv1 = nn.Conv1d(embed_dim, 128, kernel_size=5, stride=1, padding=2)  # 1D convolutional layer
        self.relu = nn.ReLU()  # Non-linear activation function: ReLU
        self.pool = nn.MaxPool1d(kernel_size=2)  # Max pooling layer: reduces the vector length by a factor of 2, keeping the most important features
        self.fc = nn.Linear(128 * (max_len // 2), num_labels)  # Fully connected layer

    # This function defines how input data flows through the CNN
    def forward(self, x):
        x = self.embedding(x).permute(0, 2, 1)  # (batch_size, embed_dim, seq_len), convert the vectors into more dense vectors of size embed_dim
        x = self.conv1(x)  # Apply the 1D convolution to extract local patterns (like n-grams, for example)
        x = self.relu(x)  # Apply the non-linear activation function ReLU
        x = self.pool(x)  # Reduce the vector length by a factor of 2
        x = x.view(x.size(0), -1)  # "Flatten" the output into a 2D tensor
        x = self.fc(x)
        return x  # Return a tensor containing the predicted class scores for each sample in the batch

# Initialize the model
vocab_size = len(vocab) + 1  # Add 1 for padding index
embed_dim = 100
model = CNNClassifier(vocab_size, embed_dim, num_labels).to(device)

# Train the model
# Define loss and optimizer
loss_function = nn.CrossEntropyLoss()  # Using the cross-entropy loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam is a popular optimizer that adapts the learning rate and reduces the loss function

# Training loop
# 1 full pass through the training set is going to take about 1 hour on Colab
# Increase [num_epochs] if time permits
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    batch_times = []

    # For each batch in the training set...
    for i, (texts, labels) in enumerate(train_loader):
        start = time.time()
        optimizer.zero_grad()  # Reset the gradient from the previous step
        outputs = model(texts)  # Get the predicted scores for each class
        loss = loss_function(outputs, labels)  # Calculate the loss
        loss.backward()  # Backpropogation: compute the gradient...
        optimizer.step()  # ...and adjust weights based on the gradient
        total_loss += loss.item()  # Keep track of the total loss
        if len(batch_times) == 100:
          print(f'average batch time = {(sum(batch_times) / len(batch_times)):.2f}s')
          print(f'estimated epoch time = {(sum(batch_times) / len(batch_times)) * len(train_loader):.2f}s')
        elif len(batch_times) < 100:
          batch_times.append(time.time() - start)

    # At the end of each epoch, print the average loss over all batches
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader)}")

  first_item = torch.tensor(self.texts[idx]).to(device)


average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
estimated epoch time = 18.87s
average batch time = 0.01s
esti

In [10]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Evaluate the model
model.eval() 
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)  # Get the predicted class
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate precision, recall, and F1 score
precision, recall, f1_score, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
accuracy = accuracy_score(all_labels, all_preds)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")

print("=== Per-Class Metrics ===")
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))

  first_item = torch.tensor(self.texts[idx]).to(device)


Accuracy: 0.8586
Precision: 0.8593
Recall: 0.8586
F1 Score: 0.8584
=== Per-Class Metrics ===
                    precision    recall  f1-score   support

     Buzzfeed News       0.70      0.72      0.71      1000
               CNN       0.83      0.85      0.84      1000
         Economist       0.88      0.99      0.93      1000
          Fox News       0.90      0.91      0.91      1000
            People       0.92      0.89      0.90      1000
          Politico       0.76      0.75      0.75      1000
           Reuters       0.95      0.94      0.94      1000
          The Hill       0.91      0.88      0.90      1000
The New York Times       0.89      0.86      0.87      1000
              Vice       0.86      0.80      0.83      1000

          accuracy                           0.86     10000
         macro avg       0.86      0.86      0.86     10000
      weighted avg       0.86      0.86      0.86     10000

