In [1]:
!pip install transformers
!pip install wget

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/c1/bd/f64d67df4d3b05a460f281defe830ffab6d7940b7ca98ec085e94e024781/transformers-4.34.1-py3-none-any.whl.metadata
  Downloading transformers-4.34.1-py3-none-any.whl.metadata (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.5/121.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Obtaining dependency information for huggingface-hub<1.0,>=0.16.4 from https://files.pythonhosted.org/packages/ef/b5/b6107bd65fa4c96fdf00e4733e2fe5729bb9e5e09997f63074bb43d3ab28/huggingface_hub-0.18.0-py3-none-any.whl.metadata
  Downloading huggingface_hub-0.18.0-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Obtaining dependency information for regex!=2019.12.17 from https://files.pythonhosted.org/packages/cd/98/999f0456bdb4124b3d0a7f1d8b6d50979536f5df9856e597580dd9a6d3f

In [43]:
import ast
import json
import os
import shutil
import urllib.request
from collections import defaultdict

import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

def download_data(links_path='dataset_download_links.json', target_path='movies_metadata.csv'):
    """Download movies_metadata.csv from the OneDrive public link if it is not already present.

    Args:
        links_path: JSON file holding the download URL under the key
            ``movies_metadata_csv_url``.
        target_path: Where the CSV is stored. Nothing is downloaded when this
            file already exists.

    Returns:
        None.

    Raises:
        OSError / urllib.error.URLError: if the links file cannot be read or
            the download fails.
        KeyError: if the links file has no ``movies_metadata_csv_url`` entry.
    """
    if os.path.exists(target_path):
        return
    with open(links_path) as f:
        download_links = json.load(f)

    url = download_links['movies_metadata_csv_url']
    print(url)

    # Download with the standard library instead of shelling out to `wget`:
    # no external binary is needed and TLS certificates are verified
    # (the previous command passed --no-check-certificate).
    # The payload goes to a temporary ".part" file first so an interrupted
    # download never leaves a truncated CSV that the exists-check above would
    # mistake for a complete one on the next run.
    partial_path = f"{target_path}.part"
    try:
        with urllib.request.urlopen(url) as response, open(partial_path, 'wb') as out:
            shutil.copyfileobj(response, out)
        os.replace(partial_path, target_path)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Fetch the movie metadata CSV on first run (no-op when the file is already present).
download_data()

# Only the columns used for modelling and for joining with the box-office table are loaded.
movies_data = pd.read_csv("movies_metadata.csv", usecols=['overview', 'production_countries', 'original_language', 'revenue', 'budget', 'imdb_id'])

# Read every box-office column as text: the amounts carry "$" and "," and are converted to
# float in the cleaning cell. The explicit entry uses the column's real name, `imdbId`
# (the merge key used later), rather than `imdb_id`, which is the movies_metadata column.
box_office_dtypes = defaultdict(lambda: str)
box_office_dtypes['imdbId'] = str

box_office_data = pd.read_csv("../box_office_collections.csv", dtype=box_office_dtypes).drop(columns=['Movie Name'])

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [44]:
# Clean movies_data into the schema:
#   overview: str, production_countries: str, original_language: str, revenue: float, budget: float
# Values that cannot be parsed as numbers become NaN (errors='coerce') and those rows are dropped.
print("Shape before ", movies_data.shape)
movies_data['budget'] = pd.to_numeric(movies_data['budget'], errors='coerce', downcast='float')
movies_data['revenue'] = pd.to_numeric(movies_data['revenue'], errors='coerce')
# Remove only a literal leading "tt". Unlike slicing with .str[2:], this is idempotent:
# re-running the cell does not strip two more characters from ids that were already cleaned.
movies_data['imdb_id'] = movies_data['imdb_id'].str.replace(r'^tt', '', regex=True)
movies_data = movies_data.dropna(subset=['revenue', 'budget'], how='any')
print("Shape after ", movies_data.shape)
print(movies_data.dtypes)
print(movies_data.head())

# 1. Convert box_office_data to the following schema:
#  imdbId: str (first column, left untouched); every other column is either empty or a USD amount
#  such as "$1,234". Strip the "$" and "," characters and convert those columns to float.
print("Shape before ", box_office_data.shape)
amount_columns = box_office_data.columns[1:]
# Raw string: '\$' inside a normal string literal is an invalid escape sequence.
box_office_data[amount_columns] = box_office_data[amount_columns].replace(r'[\$,]', '', regex=True).astype(float)

print("Shape after ", box_office_data.shape)
print(box_office_data.dtypes)
print(box_office_data.head())


Shape before  (45466, 6)
Shape after  (45460, 6)
budget                  float64
imdb_id                  object
original_language        object
overview                 object
production_countries     object
revenue                 float64
dtype: object
       budget  imdb_id original_language  \
0  30000000.0  0114709                en   
1  65000000.0  0113497                en   
2         0.0  0113228                en   
3  16000000.0  0114885                en   
4         0.0  0113041                en   

                                            overview  \
0  Led by Woody, Andy's toys live happily in his ...   
1  When siblings Judy and Peter discover an encha...   
2  A family wedding reignites the ancient feud be...   
3  Cheated on, mistreated and stepped on, the wom...   
4  Just when George Banks has recovered from his ...   

                                production_countries      revenue  
0  [{'iso_3166_1': 'US', 'name': 'United States o...  373554033.0  
1  [{'i

In [47]:
# Left-join the box-office table onto the movie metadata so every movie row is kept;
# movies without a matching imdbId get NaN in the box-office columns.
merged_data = movies_data.merge(
    box_office_data,
    how='left',
    left_on='imdb_id',
    right_on='imdbId',
)
print("Shape before ", merged_data.shape)
merged_data.head(20)

Shape before  (45460, 136)


Unnamed: 0,budget,imdb_id,original_language,overview,production_countries,revenue,imdbId,Argentina,Aruba,Australia,...,Guatemala,Netherlands Antilles,North Macedonia,South Africa/Nigeria,Switzerland (French/Italian),E/W Africa,Laos,Bosnia,Soviet Union,Malta
0,30000000.0,114709,en,"Led by Woody, Andy's toys live happily in his ...","[{'iso_3166_1': 'US', 'name': 'United States o...",373554033.0,114709.0,620584.0,,908790.0,...,,,,,,,,,,
1,65000000.0,113497,en,When siblings Judy and Peter discover an encha...,"[{'iso_3166_1': 'US', 'name': 'United States o...",262797249.0,113497.0,,,375.0,...,,,,,,,,,,
2,0.0,113228,en,A family wedding reignites the ancient feud be...,"[{'iso_3166_1': 'US', 'name': 'United States o...",0.0,,,,,...,,,,,,,,,,
3,16000000.0,114885,en,"Cheated on, mistreated and stepped on, the wom...","[{'iso_3166_1': 'US', 'name': 'United States o...",81452156.0,,,,,...,,,,,,,,,,
4,0.0,113041,en,Just when George Banks has recovered from his ...,"[{'iso_3166_1': 'US', 'name': 'United States o...",76578911.0,,,,,...,,,,,,,,,,
5,60000000.0,113277,en,"Obsessive master thief, Neil McCauley leads a ...","[{'iso_3166_1': 'US', 'name': 'United States o...",187436818.0,113277.0,,,,...,,,,,,,,,,
6,58000000.0,114319,en,An ugly duckling having undergone a remarkable...,"[{'iso_3166_1': 'DE', 'name': 'Germany'}, {'is...",0.0,114319.0,,,,...,,,,,,,,,,
7,0.0,112302,en,"A mischievous young boy, Tom Sawyer, witnesses...","[{'iso_3166_1': 'US', 'name': 'United States o...",0.0,,,,,...,,,,,,,,,,
8,35000000.0,114576,en,International action superstar Jean Claude Van...,"[{'iso_3166_1': 'US', 'name': 'United States o...",64350171.0,,,,,...,,,,,,,,,,
9,58000000.0,113189,en,James Bond must unmask the mysterious head of ...,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",352194034.0,113189.0,,,,...,,,,,,,,,,


In [None]:
# 2. Dataset and Dataloader
class RevenueDataset(Dataset):
    """Torch dataset pairing a tokenized movie overview with three tabular features.

    Each item is a dict with:
        input_ids, attention_mask: 1-D tensors of length ``max_length`` from the tokenizer.
        other_features: float tensor ``[is_us_production, is_english, budget_normalized]``.
        revenue: scalar float tensor holding the min-max normalized revenue.

    Args:
        tokenizer: object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        data: DataFrame with the columns ``overview``, ``revenue``,
            ``production_countries``, ``original_language`` and ``budget``.
            It is not modified.
        max_length: token length every overview is padded / truncated to.
        size: optional number of rows to sample (fixed seed) after cleaning.

    Attributes:
        data: cleaned frame with ``budget_normalized`` / ``revenue_normalized`` added.
        budget_min, budget_max, revenue_min, revenue_max: statistics used for the
            min-max scaling, kept so predictions can be mapped back to dollars.
    """

    REQUIRED_COLUMNS = ['overview', 'revenue', 'production_countries', 'original_language', 'budget']

    def __init__(self, tokenizer, data, max_length=256, size=None):
        self.tokenizer = tokenizer
        self.max_length = max_length

        # assign()/dropna() return new frames, so the caller's DataFrame is left
        # untouched and no column is written onto a slice of another frame.
        frame = data.assign(budget=pd.to_numeric(data['budget'], errors='coerce'))
        frame = frame.dropna(subset=self.REQUIRED_COLUMNS)

        self.revenue_min = frame['revenue'].min()
        self.revenue_max = frame['revenue'].max()
        self.budget_min = frame['budget'].min()
        self.budget_max = frame['budget'].max()

        frame = frame.assign(
            budget_normalized=self._min_max_scale(frame['budget'], self.budget_min, self.budget_max),
            revenue_normalized=self._min_max_scale(frame['revenue'], self.revenue_min, self.revenue_max),
        )

        if size:
            frame = frame.sample(n=size, random_state=42)

        self.data = frame

    @staticmethod
    def _min_max_scale(values, low, high):
        """Scale ``values`` to [0, 1]; a constant (or empty) column maps to zeros instead of NaN."""
        span = high - low
        if not span > 0:  # also true when span is NaN (empty input)
            return values * 0.0
        return (values - low) / span

    @staticmethod
    def _is_us_production(raw_countries):
        """Return 1 when the first listed production country is the US, else 0.

        ``raw_countries`` is expected to be the string form of a list such as
        ``[{'iso_3166_1': 'US', 'name': 'United States of America'}]``.
        """
        try:
            # literal_eval only parses Python literals; unlike eval it cannot
            # execute code embedded in the CSV.
            first_code = ast.literal_eval(raw_countries)[0]['iso_3166_1']
        except (ValueError, SyntaxError, TypeError, IndexError, KeyError):
            # Malformed, empty or unexpectedly shaped value: treat as non-US.
            return 0
        return 1 if first_code == "US" else 0

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        inputs = self.tokenizer.encode_plus(row['overview'], add_special_tokens=True, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')

        # For simplicity both categorical features are binary flags.
        country = self._is_us_production(row['production_countries'])
        language = 1 if row['original_language'] == 'en' else 0  # 1 if English, 0 otherwise

        other_features = torch.tensor(
            [country, language, float(row['budget_normalized'])],
            dtype=torch.float,
        )

        return {
            # squeeze(0) drops only the batch axis added by return_tensors='pt'.
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "other_features": other_features,
            "revenue": torch.tensor(row['revenue_normalized'], dtype=torch.float)
        }

    def __len__(self):
        return len(self.data)

In [None]:
# `data` is not defined by any earlier cell, so a fresh Restart & Run All raised NameError here.
# Bind it to the cleaned movie metadata, which holds every column RevenueDataset reads; later
# cells that look up data['revenue'] / data['budget'] use the same name.
# NOTE(review): merged_data carries the same movie columns plus the box-office ones — switch to
# it if those extra columns are meant to be available on the dataset.
data = movies_data
dataset = RevenueDataset(tokenizer, data)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [None]:
# 3. Model
class RevenuePredictor(nn.Module):
    """Regress normalized revenue from a BERT text embedding plus tabular features.

    The pooled BERT output and the tabular feature vector are each projected
    to 128 dimensions, concatenated, and mapped to a single non-negative value.

    Args:
        num_other_features: length of the tabular feature vector. Defaults to 3
            (country flag, language flag, normalized budget).
    """

    def __init__(self, num_other_features=3):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")

        # Linear layer for textual embeddings
        self.linear_text = nn.Linear(self.bert.config.hidden_size, 128)

        # Linear layer for non-textual features (country, language, budget)
        self.linear_other = nn.Linear(num_other_features, 128)

        # Final output layer; the ReLU clamps predictions to be >= 0.
        self.out = nn.Sequential(nn.Linear(256, 1), nn.ReLU())

    def forward(self, input_ids, attention_mask, other_features):
        """Return one prediction per batch row as a tensor of shape (batch,)."""
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_embedding = self.linear_text(bert_out['pooler_output'])

        other_embedding = self.linear_other(other_features)

        combined = torch.cat([text_embedding, other_embedding], dim=1)
        output = self.out(combined)
        # Drop only the trailing size-1 feature axis. A bare squeeze() would also
        # remove the batch axis when the batch holds a single row, leaving a 0-d
        # tensor whose shape no longer matches the (batch,) target.
        return output.squeeze(-1)

# Instantiate the predictor; its constructor loads the pretrained "bert-base-uncased" weights.
model = RevenuePredictor()

In [None]:
# 4. Optimizer & Loss
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated in favour of it.
# eps and weight_decay are passed explicitly with the values the transformers implementation
# used by default (1e-6 and 0.0), so the optimisation behaves as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
loss_fn = nn.MSELoss()
# Linear decay of the learning rate to zero over the whole run, without warm-up.
# The step count assumes 3 epochs and must match the training loop.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader) * 3)  # 3 epochs

In [None]:
# 5. Training Loop
# Pick the GPU when PyTorch can see one, otherwise stay on the CPU, and move the model there.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

In [None]:
# Fine-tune for 3 epochs (must match the step count given to the LR scheduler).
log_every = 100  # batches between progress lines; printing every batch floods the output

for epoch in range(3):
    model.train()
    total_loss = 0
    for step, batch in enumerate(dataloader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        other_features = batch["other_features"].to(device)
        revenue = batch["revenue"].to(device)

        # Flatten to (batch,) so predictions always line up with the target, including
        # a final batch that holds a single row.
        predictions = model(input_ids, attention_mask, other_features).reshape(-1)
        loss = loss_fn(predictions, revenue)

        loss.backward()
        # Clip the global gradient norm to keep fine-tuning updates bounded.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        if step % log_every == 0:
            print(f"epoch {epoch + 1} | batch {step} | mean loss so far {total_loss / (step + 1)}")

    print(f"Epoch {epoch + 1} | Loss: {total_loss / len(dataloader)}")

In [None]:
# Switch the model to evaluation mode before running predictions.
model.eval()

def denormalize_revenue(normalized_value):
    """Map a min-max normalized revenue back to the original revenue scale.

    The minimum and maximum are read from the ``revenue`` column of the
    module-level ``data`` frame on every call.
    """
    revenue_column = data['revenue']
    lowest = revenue_column.min()
    spread = revenue_column.max() - lowest
    return normalized_value * spread + lowest
# Load the tokenizer for inference (the same "bert-base-uncased" checkpoint loaded at the top of the notebook)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 4. Test Function
def predict_revenue(prompt, other_features_input):
    """Run the model on one overview and return its raw (normalized) prediction.

    Args:
        prompt: overview text of the movie.
        other_features_input: list of tabular feature values for the movie; it is
            wrapped in an outer list so the tensor has shape (1, number_of_features).

    Returns:
        The model output as a Python float.
    """
    encoded = tokenizer.encode_plus(
        prompt,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    features = torch.tensor([other_features_input], dtype=torch.float).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        score = model(
            encoded["input_ids"].to(device),
            encoded["attention_mask"].to(device),
            features,
        )
    return score.item()

# 5. Test the function
prompt = "When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys, and other terrifying creatures."

# Feature order matches RevenueDataset: [is_us_production, is_english, budget_normalized].
# The budget has to be min-max scaled like the training data; a raw figure such as 40.9 lies far
# outside the [0, 1] range the model is trained on.
prompt_budget_usd = 65_000_000  # budget listed for this overview in movies_metadata
budget_min = data['budget'].min()
budget_max = data['budget'].max()
prompt_budget_normalized = float((prompt_budget_usd - budget_min) / (budget_max - budget_min))

other_features_for_prompt = [1, 1, prompt_budget_normalized]

predicted_revenue_normalized = predict_revenue(prompt, other_features_for_prompt)
predicted_revenue_actual = denormalize_revenue(predicted_revenue_normalized)

print(f"Predicted revenue for the movie: ${predicted_revenue_actual}")


In [None]:
# Min-max normalise a raw budget the same way RevenueDataset does: (x - min) / (max - min).
# Dividing by max_budget alone only agrees with that formula when min_budget happens to be 0.
max_budget = data['budget'].max()
min_budget = data['budget'].min()

normalized_budget = (65000000 - min_budget) / (max_budget - min_budget)
print(normalized_budget)