In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Load the preprocessed dataset
preproc_data = pd.read_csv('../data/preproc_data.csv')  # Replace 'your_dataset.csv' with the actual dataset file
preproc_data = preproc_data.dropna()

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(preproc_data, test_size=0.2, random_state=42)

# Load the pre-trained transformer model and tokenizer
model_name = "google/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
transformer_model = AutoModel.from_pretrained(model_name)

# Define the popularity prediction model
class PopularityPredictionModel(nn.Module):
    def __init__(self, transformer_model):
        super(PopularityPredictionModel, self).__init__()
        self.transformer = transformer_model
        self.fc_layer = nn.Linear(self.transformer.config.hidden_size, self.transformer.config.hidden_size)
        self.activation = nn.Tanh()
        self.regressor_likes = nn.Linear(self.transformer.config.hidden_size, 1)
        self.regressor_views = nn.Linear(self.transformer.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # Extract the CLS token embedding
        fc_output = self.activation(self.fc_layer(cls_embedding))  # Apply fully-connected layer with activation
        likes_prediction = self.regressor_likes(fc_output)  # Predict the number of likes
        views_prediction = self.regressor_views(fc_output)  # Predict the number of views
        return likes_prediction, views_prediction

# Initialize the popularity prediction model
popularity_model = PopularityPredictionModel(transformer_model)

# Train the model using train_df and validate using val_df (You'll need to define the training loop)

# Define a custom dataset class
class NewsDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        title = self.data.iloc[idx]['title']
        encoding = self.tokenizer(title, return_tensors="pt", padding="max_length", truncation=True)
        likes = self.data.iloc[idx]['likes']
        views = self.data.iloc[idx]['views']
        return {'input_ids': encoding['input_ids'], 'attention_mask': encoding['attention_mask'], 'likes': likes, 'views': views}

# Create instances of the custom dataset for training and validation
train_dataset = NewsDataset(train_df, tokenizer)
val_dataset = NewsDataset(val_df, tokenizer)

# Define the loss function and optimizer
criterion = nn.MSELoss()  # Mean Squared Error Loss
optimizer = torch.optim.Adam(popularity_model.parameters(), lr=1e-5)

# Define the training loop
def train_model(model, train_loader, criterion, optimizer, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch in train_loader:
            input_ids, attention_mask, likes, views = batch['input_ids'], batch['attention_mask'], batch['likes'], batch['views']
            optimizer.zero_grad()
            likes_pred, views_pred = model(input_ids, attention_mask)
            loss = criterion(likes_pred, likes.float()) + criterion(views_pred, views.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch {epoch+1} - Loss: {running_loss / len(train_loader)}")

# Create data loaders for training and validation
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Train the model
train_model(popularity_model, train_loader, criterion, optimizer, num_epochs=5)

# Example usage:
input_headline = "Your news headline goes here"
input_encoding = tokenizer(input_headline, return_tensors="pt", padding="max_length", truncation=True)
likes_pred, views_pred = popularity_model(input_encoding["input_ids"], input_encoding["attention_mask"])


OSError: google/all-mpnet-base-v2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.