In [164]:
import requests
import json
import pandas as pd

In [165]:
api_key = 'CHTCbuAHiAhAYlxD8mkLDE2m6QnNIVpG'

In [166]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer

class BertForSentimentClassification(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', num_classes=2):
        super(BertForSentimentClassification, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.classifier(x)
        return logits

In [167]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer

def predict_sentiment(model, tokenizer, text, device=None):
    """
    Make softmax predictions on input text using the trained BERT model.

    Args:
    - model: Trained BertForSentimentClassification model
    - text: Input text string
    - device: torch.device (if None, will use CUDA if available)

    Returns:
    - predictions: Dict containing softmax probabilities and predicted class
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.eval()
    model.to(device)

    # Tokenize input text
    encoded_input = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')

    # Move input to device
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)

    # Make prediction
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Apply softmax to get probabilities
    probabilities = F.softmax(outputs, dim=1)

    # Convert to numpy for easier handling
    probabilities = probabilities.cpu().numpy()[0]

    return [float(probabilities[0]), float(probabilities[1]), float(probabilities[1]) - float(probabilities[0])]


# Usage example:
# Assuming you have a trained model
model = torch.load('model.pt', map_location=torch.device('cpu'))

In [168]:
url = f'https://api.nytimes.com/svc/archive/v1/2024/1.json?api-key={api_key}'

In [169]:
# sending get request and saving the response as response object
r = requests.get(url = url)
 
# extracting data in json format
data = r.json()

In [170]:
data = pd.DataFrame(data['response']['docs'])

In [171]:
data = data[data['type_of_material'] == 'News']

In [172]:
# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

(0.9299237728118896, 0.07007624208927155, -0.8598475307226181)


In [173]:
categories = [
    'Business Day',
    'World',
    'Arts',
    'Times Insider',
    'U.S.',
    'Travel',
    'Style',
    'Food',
    'Real Estate',
    'Movies',
    'Briefing',
    'Science',
    'Your Money',
    'The Learning Network',
    'Climate',
    'Health',
    'Theater',
    'Books',
    'Magazine',
    'Sports',
    'Fashion & Style',
    'T Magazine',
    'Technology',
    'Multimedia/Photos',

]

In [174]:
data = data[data['section_name'].isin(categories)]
data.dropna(subset=['print_section'], inplace=True)
data.dropna(subset=['snippet'], inplace=True)
data = data[data['keywords'] != '[]']

In [175]:
data[['negative', 'positive', 'sentiment_score']] = data['snippet'].apply(lambda row: predict_sentiment(model, tokenizer, row))

ValueError: Columns must be same length as key