In [58]:
from transformers import BertTokenizer, BertModel
import torch, os
from torch.utils.data import random_split
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# The tokenizer and the encoder are both loaded from the same checkpoint name.
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
bert_model = BertModel.from_pretrained(PRETRAINED_NAME)
bert_model = bert_model.to(device)

# Batch size handed to the DataLoaders built from the dataset.
bs = 5120

In [62]:
class BlueSky(Dataset):
    """Dataset that turns rows of a posts CSV into ``(features, label)`` tensors.

    Each row must provide the columns ``text``, ``time``, ``follows_count``,
    ``followers_count`` and ``likes``.  The feature vector of a sample is the
    concatenation of

    * the encoder's first-token hidden state for ``text`` (the ``[CLS]``
      position for BERT-style tokenizers),
    * ``follows_count`` and ``followers_count``,
    * ``time`` min-max normalised over every row of the file.

    The label is a one-element tensor holding ``likes``.  All returned tensors
    live on the device the encoder is on.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt', ...)``;
            its result must support ``.to(device)`` and ``**`` unpacking.
        model: Encoder module whose output exposes ``last_hidden_state``.

    Raises:
        ValueError: If the CSV contains no rows.
    """

    def __init__(self, file, tokenizer, model):
        self.tokenizer = tokenizer
        self.model = model
        self.data = pd.read_csv(file)
        self.post_creation_time = self.data['time'].tolist()
        if not self.post_creation_time:
            # min()/max() would raise ValueError anyway; say why instead.
            raise ValueError(f"no rows found in {file!r}; cannot build the dataset")
        self.post_creation_time_min = min(self.post_creation_time)*1e9
        self.post_creation_time_max = max(self.post_creation_time)*1e9

    @property
    def _device(self):
        """Device the encoder currently lives on (CPU if it cannot be determined)."""
        dev = getattr(self.model, 'device', None)
        if dev is not None:
            return torch.device(dev)
        try:
            return next(self.model.parameters()).device
        except (StopIteration, AttributeError):
            return torch.device('cpu')

    @staticmethod
    def _scalar(value):
        """Return ``value`` as a plain Python number, unwrapping NumPy scalars."""
        return value.item() if hasattr(value, 'item') else value

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx``."""
        # Follow the encoder's own device instead of a notebook-level global so
        # inputs and weights can never end up on different devices.
        device = self._device
        # Only switch modes when needed; eval() visits every submodule.
        if self.model.training:
            self.model.eval()

        item = self.data.iloc[idx]
        text = str(item['text'])
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        with torch.no_grad():
            outputs = self.model(**inputs)
            # One text per call, so drop exactly the leading batch axis.  The
            # tensor is already on `device`; no CPU/NumPy round trip is needed.
            text_embeddings = outputs.last_hidden_state[:, 0, :].squeeze(0)

        # Explicit float dtype so integer count columns concatenate with the
        # float embedding without relying on implicit promotion in torch.cat.
        numerical_data = torch.tensor(
            [self._scalar(item['follows_count']), self._scalar(item['followers_count'])],
            dtype=torch.get_default_dtype(),
            device=device,
        )

        # Min-max normalise the timestamp.  When every row shares one timestamp
        # the range is zero; use 0.0 rather than dividing by zero.
        time_span = self.post_creation_time_max - self.post_creation_time_min
        if time_span:
            time_normalized = (self._scalar(item['time'])*1e9 - self.post_creation_time_min) / time_span
        else:
            time_normalized = 0.0
        post_creation_time_normalized = torch.tensor([time_normalized], device=device)

        out = torch.cat((text_embeddings, numerical_data, post_creation_time_normalized), dim=0)
        label = torch.tensor([self._scalar(item['likes'])], device=device)
        return out, label

In [None]:
dataset = "model_data.csv"
data = BlueSky(dataset, tokenizer, bert_model)

# Seed the split so the same rows land in train/test on every fresh run;
# with an unseeded split, test losses from different runs are measured on
# different rows and cannot be compared.
SPLIT_SEED = 42
train_dataset, test_dataset = random_split(
    data,
    [0.8, 0.2],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_dl = DataLoader(train_dataset, batch_size=bs, shuffle=True)
# Shuffling adds nothing at evaluation time; keep test batches in a fixed order.
test_dl = DataLoader(test_dataset, batch_size=bs, shuffle=False)

In [64]:
class RegressionModel(nn.Module):
    """Fully connected regressor: input_size -> 128 -> 64 -> 1.

    ReLU follows each of the first two linear layers; the output of the last
    layer is returned unchanged.
    """

    def __init__(self, input_size):
        super().__init__()
        wide, narrow = 128, 64
        # Layers are created in fc1, fc2, fc3 order; the attribute names are
        # also the prefixes of the state-dict keys.
        self.fc1 = nn.Linear(input_size, wide)
        self.fc2 = nn.Linear(wide, narrow)
        self.fc3 = nn.Linear(narrow, 1)

    def forward(self, x):
        """Map a batch of feature vectors to one value per row."""
        activations = x
        for hidden_layer in (self.fc1, self.fc2):
            activations = torch.relu(hidden_layer(activations))
        return self.fc3(activations)

In [None]:
# One sample is enough to read the feature width.  Pulling a whole batch from
# train_dl would embed up to `bs` texts with the encoder just to get this number.
# A single sample is 1-D (features,), so the width is shape[0].
input_size = train_dataset[0][0].shape[0]
model = RegressionModel(input_size).to(device)

In [None]:
# Mean-squared-error objective, minimised with Adam over every parameter of
# the regression model.
LEARNING_RATE = 1e-4
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
num_epochs = 50
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss_sum, train_count = 0.0, 0
    # Loop variables are named `features`/`labels` (not `data`/`label`) so the
    # notebook-level `data` dataset is not overwritten by a batch tensor.
    for features, labels in tqdm(train_dl, desc=f'Epoch {epoch+1}/{num_epochs}'):
        optimizer.zero_grad()
        pred = model(features)
        loss = criterion(pred, labels.float())
        loss.backward()
        optimizer.step()
        # The criterion returns a per-batch mean; weight it by the batch size
        # so the epoch figure is a true per-sample average.
        n_rows = features.size(0)
        train_loss_sum += loss.item() * n_rows
        train_count += n_rows

    # Report once per reporting epoch (previously one line per batch).
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss_sum / max(train_count, 1):.4f}')

    # ---- evaluation pass ----
    model.eval()
    test_loss_sum, test_count = 0.0, 0
    with torch.no_grad():
        for features, labels in test_dl:
            pred = model(features)
            test_loss = criterion(pred, labels.float())
            n_rows = features.size(0)
            test_loss_sum += test_loss.item() * n_rows
            test_count += n_rows
    # One aggregated figure for the whole test split instead of one per batch.
    print(f'Test Loss: {test_loss_sum / max(test_count, 1):.4f}')

In [None]:
# Persist only the learned parameters (the state dict), not the module object.
MODEL_PATH = 'model.pth'
torch.save(model.state_dict(), MODEL_PATH)