In [1]:
!pip install transformers
!pip install -U "ray[tune]"



# Imports

In [2]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from matplotlib import pyplot as plt
from ray import tune


# Loading prepared train and test data

In [3]:
# Load the prepared splits and replace missing overviews with empty strings.
# The column is reassigned instead of calling `fillna(..., inplace=True)` on the
# `df[col]` selection: that chained in-place form mutates an intermediate object,
# raises a FutureWarning on pandas 2.x and no longer updates the frame once
# Copy-on-Write is enabled.
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')
train_data['overview'] = train_data['overview'].fillna('')
test_data['overview'] = test_data['overview'].fillna('')

In [4]:
# 2. Dataset and Dataloader
class RevenueDataset(Dataset):
    """Dataset that turns one DataFrame row into a ``(features, revenue)`` pair.

    ``features`` is one flat tensor concatenated in this order::

        input_ids | attention_mask | original_language_* | genre_* | cast_*
        | crew_* | budget_100M | budget_unknown

    The tokenizer is asked to pad/truncate the overview to ``max_length``.
    All tensors are created on ``device``.

    Args:
        tokenizer: object exposing ``encode_plus``; applied to the ``overview`` text.
        data: DataFrame with ``overview``, the prefixed feature columns,
            ``budget_100M``, ``budget_unknown`` and ``revenue_100M``.
        device: torch device on which the returned tensors are allocated.
        max_length: token length requested from the tokenizer.
    """

    def __init__(self, tokenizer, data, device, max_length=256):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = data
        self.device = device

        def with_prefix(prefix):
            # Column names of `data` that start with `prefix`, in frame order.
            return [name for name in data.columns if name.startswith(prefix)]

        self.original_language_cols = with_prefix('original_language_')
        self.genre_cols = with_prefix('genre_')
        self.cast_cols = with_prefix('cast_')
        self.crew_cols = with_prefix('crew_')

    def _as_float(self, value):
        """Wrap ``value`` in a float tensor on the dataset's device."""
        return torch.tensor(value, dtype=torch.float, device=self.device)

    def _group(self, record, columns):
        """Float tensor holding the values of ``columns`` taken from ``record``."""
        return self._as_float(record[columns].values.astype(float))

    def __getitem__(self, idx):
        record = self.data.iloc[idx]

        encoded = self.tokenizer.encode_plus(
            record['overview'],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        ).to(self.device)

        language_part = self._group(record, self.original_language_cols)
        genre_part = self._group(record, self.genre_cols)
        cast_part = self._group(record, self.cast_cols)
        crew_part = self._group(record, self.crew_cols)
        budget_part = self._as_float(record['budget_100M'])
        budget_unknown_part = self._as_float(record['budget_unknown'])
        target = self._as_float(record['revenue_100M'])

        # The scalar budget fields are lifted to 1-d so they can be concatenated.
        pieces = (
            encoded["input_ids"].squeeze(),
            encoded["attention_mask"].squeeze(),
            language_part,
            genre_part,
            cast_part,
            crew_part,
            budget_part.unsqueeze(0),
            budget_unknown_part.unsqueeze(0),
        )
        return torch.cat(pieces), target

    def __len__(self):
        return len(self.data)

In [5]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
DEVICE

device(type='cuda')

In [6]:
# Tokenizer for the same "bert-base-uncased" checkpoint that RevenuePredictor loads.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [7]:
# Wrap both splits with the shared tokenizer; tensors are created on DEVICE.
train_dataset, test_dataset = (
    RevenueDataset(tokenizer, frame, DEVICE) for frame in (train_data, test_data)
)

In [8]:
# Number of columns in each prefixed feature group of the training frame.
NUM_GENRES, NUM_CAST, NUM_CREW, NUM_ORIGINAL_LANGUAGES = (
    len(columns)
    for columns in (
        train_dataset.genre_cols,
        train_dataset.cast_cols,
        train_dataset.crew_cols,
        train_dataset.original_language_cols,
    )
)

In [9]:
# 3. Model
class RevenuePredictor(nn.Module):
    """Predict revenue from a BERT-encoded overview plus tabular features.

    Every input row must be the flat vector produced by
    ``RevenueDataset.__getitem__``, i.e. in this order::

        input_ids | attention_mask | original_language | genres | cast | crew
        | budget | budget_unknown

    Args:
        bert_embedding_size: width of the projection applied to BERT's pooled output.
        original_language_embedding_size: width of the language projection.
        cast_embedding_size: width of the cast projection.
        crew_embedding_size: width of the crew projection.
        hidden_size: width of the hidden layer of the regression head.
        max_length: number of token ids (and of attention-mask entries) at the
            start of each row; must equal the ``max_length`` of the dataset.
    """

    def __init__(self, bert_embedding_size=128, original_language_embedding_size=32, cast_embedding_size=32, crew_embedding_size=32, hidden_size=256, max_length=256):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")

        # Widths of the segments of the flat input row; captured once so that
        # forward() slices with the same sizes the layers were built with.
        self.max_length = max_length
        self.num_original_languages = NUM_ORIGINAL_LANGUAGES
        self.num_genres = NUM_GENRES
        self.num_cast = NUM_CAST
        self.num_crew = NUM_CREW

        # Linear layer for textual embeddings
        self.linear_overview = nn.Linear(self.bert.config.hidden_size, bert_embedding_size)

        # Linear layer for original language embeddings
        self.linear_original_language = nn.Linear(self.num_original_languages, original_language_embedding_size)

        # Linear layer for embedding cast
        self.linear_cast = nn.Linear(self.num_cast, cast_embedding_size)

        # Linear layer for embedding crew
        self.linear_crew = nn.Linear(self.num_crew, crew_embedding_size)

        # Genres plus budget and budget_unknown are fed to the head unprojected
        self.other_features_size = 2 + self.num_genres

        self.output_layer = nn.Sequential(
            nn.Linear(bert_embedding_size + original_language_embedding_size + cast_embedding_size + crew_embedding_size + self.other_features_size, hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_size, 1)
        )

    def forward(self, input):
        """Return one revenue prediction per row of ``input``.

        Args:
            input: 2-d float tensor, one flat feature row per sample.

        Returns:
            Tensor of shape ``(batch, 1)``.

        Raises:
            ValueError: if the row width does not match the expected layout.
        """
        # Segment boundaries, in the order the dataset concatenates them. The
        # genre block sits between the language and cast blocks and must be
        # skipped over explicitly, otherwise every later slice is shifted.
        ids_end = self.max_length
        mask_end = ids_end + self.max_length
        language_end = mask_end + self.num_original_languages
        genre_end = language_end + self.num_genres
        cast_end = genre_end + self.num_cast
        crew_end = cast_end + self.num_crew
        expected_width = crew_end + 2  # budget and budget_unknown close the row

        if input.dim() != 2 or input.shape[1] != expected_width:
            raise ValueError(
                f"expected input of shape (batch, {expected_width}), got {tuple(input.shape)}"
            )

        bert_out = self.bert(
            input_ids=input[:, :ids_end].long(),
            attention_mask=input[:, ids_end:mask_end].long(),
        )
        overview_embedding = self.linear_overview(bert_out['pooler_output'])

        original_language_embedding = self.linear_original_language(input[:, mask_end:language_end])
        cast_embedding = self.linear_cast(input[:, genre_end:cast_end])
        crew_embedding = self.linear_crew(input[:, cast_end:crew_end])

        # Unprojected features: genres followed by budget and budget_unknown.
        other_features = torch.cat((input[:, language_end:genre_end], input[:, crew_end:]), dim=1)

        return self.output_layer(torch.cat((
            overview_embedding,
            original_language_embedding,
            cast_embedding,
            crew_embedding,
            other_features
        ), dim=1))

In [10]:
def raytune_objective(config):
    """Train and evaluate one ``RevenuePredictor`` for a Ray Tune trial.

    Args:
        config: trial configuration. Required keys: ``batch_size``,
            ``bert_embedding_size``, ``original_language_embedding_size``,
            ``cast_embedding_size``, ``crew_embedding_size``, ``hidden_size``,
            ``adamw_lr``, ``train_dataset`` and ``test_dataset``. Optional keys:
            ``num_epochs`` (default 10) and ``warmup_steps`` (default 250).

    Returns:
        dict: ``{'best_mse': m}`` where ``m`` is the lowest test-set MSE
        measured after any epoch.
    """
    batch_size = config['batch_size']
    num_epochs = config.get('num_epochs', 10)
    warmup_steps = config.get('warmup_steps', 250)

    train_dataloader = DataLoader(config['train_dataset'], batch_size=batch_size, shuffle=True)
    # Sample order has no effect on the MSE, so evaluation is not shuffled.
    test_dataloader = DataLoader(config['test_dataset'], batch_size=batch_size, shuffle=False)

    model = RevenuePredictor(
        bert_embedding_size=config['bert_embedding_size'],
        original_language_embedding_size=config['original_language_embedding_size'],
        cast_embedding_size=config['cast_embedding_size'],
        crew_embedding_size=config['crew_embedding_size'],
        hidden_size=config['hidden_size']
    )
    optimizer = AdamW(model.parameters(), lr=config['adamw_lr'])
    loss_fn = nn.MSELoss()
    # The scheduler is stepped once per batch, so it spans every batch of every epoch.
    scheduler_steps = num_epochs * len(train_dataloader)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=scheduler_steps)
    model.to(DEVICE)

    best_mse = float('inf')
    history = []

    for epoch in range(num_epochs):
        print("Started epoch ", epoch + 1)
        model.train()
        for x, y in train_dataloader:
            optimizer.zero_grad()

            predictions = model(x)
            # Targets arrive as (batch,); predictions are (batch, 1).
            loss = loss_fn(predictions, y.unsqueeze(1))

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()

        # Evaluate on test set
        model.eval()
        preds = []
        actuals = []
        with torch.no_grad():
            for x, y in test_dataloader:
                predictions = model(x)
                # Flatten explicitly: a bare squeeze() turns a final batch of
                # size 1 into a 0-d tensor whose tolist() is a plain float,
                # which list.extend() cannot consume.
                preds.extend(predictions.reshape(-1).tolist())
                actuals.extend(y.reshape(-1).tolist())
        mse = loss_fn(torch.tensor(preds).unsqueeze(1), torch.tensor(actuals).unsqueeze(1)).item()
        print("MSE on test set after epoch ", epoch + 1, ": ", mse)
        history.append(mse)
        best_mse = min(best_mse, mse)

    print(history)
    # Plot loss history
    plt.plot(history)
    plt.xlabel('Epoch')
    plt.ylabel('MSE Loss')
    plt.title('Loss curve')
    plt.show()
    return {'best_mse': best_mse}

In [11]:
# Hyperparameters sampled per trial, plus the two datasets, which are handed to
# the objective through the same config dict.
search_space = {
    'batch_size': tune.choice([8, 16, 32]),
    'bert_embedding_size': tune.choice([64, 128, 256]),
    'original_language_embedding_size': tune.choice([16, 32]),
    'cast_embedding_size': tune.choice([16, 32]),
    'crew_embedding_size': tune.choice([16, 32]),
    'hidden_size': tune.choice([128, 256]),
    'adamw_lr': tune.loguniform(5e-5, 1e-3),
    'train_dataset': train_dataset,
    'test_dataset': test_dataset
}

# Each trial reserves 2 CPUs and 1 GPU. `metric`/`mode` name the value returned
# by the objective so the results can be ranked (lower `best_mse` is better)
# without repeating the criterion at every lookup.
tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(raytune_objective),
        resources={"cpu": 2, "gpu": 1}
    ),
    param_space=search_space,
    tune_config=tune.TuneConfig(metric='best_mse', mode='min', num_samples=5)
)

In [None]:
# Launch the tuning experiment and keep the returned results object.
tune_results = tuner.fit()

2023-11-30 19:13:55,452	INFO worker.py:1673 -- Started a local Ray instance.
2023-11-30 19:13:57,990	INFO tune.py:220 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `Tuner(...)`.
2023-11-30 19:13:57,994	INFO tune.py:595 -- [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


+--------------------------------------------------------------------------+
| Configuration for experiment     raytune_objective_2023-11-30_19-13-51   |
+--------------------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator                   |
| Scheduler                        FIFOScheduler                           |
| Number of trials                 5                                       |
+--------------------------------------------------------------------------+

View detailed results here: /root/ray_results/raytune_objective_2023-11-30_19-13-51
To visualize your results with TensorBoard, run: `tensorboard --logdir /root/ray_results/raytune_objective_2023-11-30_19-13-51`

Trial status: 5 PENDING
Current time: 2023-11-30 19:15:06. Total running time: 1min 6s
Logical resource usage: 0/2 CPUs, 0/1 GPUs
+--------------------------------------------------------------------------------------------------------------------



[36m(raytune_objective pid=10288)[0m Started epoch  1
Trial status: 1 RUNNING | 4 PENDING
Current time: 2023-11-30 19:18:10. Total running time: 4min 10s
Logical resource usage: 2.0/2 CPUs, 1.0/1 GPUs
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                      status       batch_size     bert_embedding_size     ...ge_embedding_size     cast_embedding_size     crew_embedding_size     hidden_size      adamw_lr |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| raytune_objective_9d159_00000   RUNNING              32                     128                       16                      32                      32             128   8.22419e-05 |
| raytune_objective_9d159_00001   PENDING        