In [None]:
!pip install transformers[torch]
!pip install numba
!pip install accelerate -U
!pip install wonderwords



## Libraries

In [None]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import random_split
from transformers import BertModel, T5ForConditionalGeneration, BertTokenizer, T5Tokenizer
from wonderwords import RandomWord

In [None]:
## Load datasets
from google.colab import drive

# Mount Google Drive so the project folder is reachable from the Colab VM
drive.mount('/content/drive')

# Single root for every project path: relocating the project is a one-line change
project_dir = '/content/drive/My Drive/ds266proj'
model_checkpoints_path = project_dir + '/model_checkpoints'

# Read the real-card tables and the generated (fake) card tables from Google Drive
hs_all_data = pd.read_csv(project_dir + '/hs_cards_data_text.csv')
nr_all_data = pd.read_csv(project_dir + '/nr_cards_data_text.csv')
hs_fakes_data = pd.read_csv(project_dir + '/hs_fakes_withcols.csv')
nr_fakes_data = pd.read_csv(project_dir + '/nr_fakes_withcols.csv')

# Keep only the text description plus the two categorical columns of each fakes table
nr_fakes_data = nr_fakes_data[['description', 'faction', 'type']]
hs_fakes_data = hs_fakes_data[['description', 'class', 'type']]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
## Preview the first rows of the NetRunner and Hearthstone fake-card tables
for fakes_frame in (nr_fakes_data, hs_fakes_data):
    print(fakes_frame.head())

                                         description       faction    type
0  The card named Squida is a 3 advancement agend...  neutral-corp  agenda
1  The card named "The Legacy of the Nation" is a...  neutral-corp  agenda
2  The card named The Edge of the Community II is...  neutral-corp  agenda
3  The card named Grain is a 5 advancement agenda...  neutral-corp  agenda
4  The card named Interpolation is a 3 advancemen...  neutral-corp  agenda
                                         description   class   type
0  The card named Rift Reap is a 1 cost holy spel...  Priest  Spell
1  The card named Dreadspell is a 3 cost spell . ...  Priest  Spell
2  The card named Assault on Immortals is a 2 cos...  Priest  Spell
3  The card named Shadow Frozen is a 6-cost shado...  Priest  Spell
4  The card named Priest of the Night is a 2 cost...  Priest  Spell


In [None]:
## Create the labeled data: real descriptions first, fake descriptions after them
all_nr_desc = nr_all_data['description'].tolist() + nr_fakes_data['description'].tolist()
all_hs_desc = hs_all_data['description'].tolist() + hs_fakes_data['description'].tolist()

# Labels follow the same ordering as the descriptions: 1 = real card, 0 = fake card
nr_labels = [1 for _ in range(len(nr_all_data))] + [0 for _ in range(len(nr_fakes_data))]
hs_labels = [1 for _ in range(len(hs_all_data))] + [0 for _ in range(len(hs_fakes_data))]

## NetRunner


### BERT Classifier

In [None]:
## Load the tokenizer that matches the 'bert-base-uncased' checkpoint used by the classifiers
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
## Tokenize the data
def _encode_descriptions(descriptions, max_length=512):
    """Encode each description into a list of token ids, truncated to ``max_length``."""
    return [
        tokenizer.encode(text, add_special_tokens=True, max_length=max_length, truncation=True)
        for text in descriptions
    ]


def _pad_to_longest(sequences, pad_id=0):
    """Right-pad every sequence with ``pad_id`` up to the longest one.

    Returns ``(padded_sequences, longest_length)``. An empty input yields
    ``([], 0)`` instead of raising from ``max()`` on an empty iterable.
    """
    longest = max((len(seq) for seq in sequences), default=0)
    padded = [seq + [pad_id] * (longest - len(seq)) for seq in sequences]
    return padded, longest


nr_tokenized = _encode_descriptions(all_nr_desc)
hs_tokenized = _encode_descriptions(all_hs_desc)

# Ensure all tokenized sequences have the same length by padding shorter sequences.
# The pad id stays 0 because the training/evaluation cells build the attention
# mask as `input_ids != 0`.
padded_nr_tokenized, max_seq_length_nr = _pad_to_longest(nr_tokenized)
padded_hs_tokenized, max_seq_length_hs = _pad_to_longest(hs_tokenized)


In [None]:
## Create the dataloader and data to use with the loader
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class DescriptionData(Dataset):
    """Map-style dataset pairing token-id sequences with their labels."""

    def __init__(self, tokenized_descriptions, labels):
        # The two containers are indexed in parallel by __getitem__.
        self.labels = labels
        self.tokenized_descriptions = tokenized_descriptions

    def __len__(self):
        """Return the number of examples (one per token-id sequence)."""
        n_examples = len(self.tokenized_descriptions)
        return n_examples

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for position ``idx`` as newly built tensors."""
        token_ids = torch.tensor(self.tokenized_descriptions[idx])
        label = torch.tensor(self.labels[idx])
        return token_ids, label

nr_dataset = DescriptionData(padded_nr_tokenized, nr_labels)
hs_dataset = DescriptionData(padded_hs_tokenized, hs_labels)

## Prepare the training and test datasets
#+++++++++++++++++++++++++++++++++++++++++++++++++
TRAIN_FRACTION = 0.7   # share of each dataset used for training; the rest is validation
BATCH_SIZE = 32
SPLIT_SEED = 42        # fixed seed so the train/validation membership is identical on every run

total_size_nr = len(nr_dataset)
train_size_nr = int(TRAIN_FRACTION * total_size_nr)
val_size_nr = total_size_nr - train_size_nr

total_size_hs = len(hs_dataset)
train_size_hs = int(TRAIN_FRACTION * total_size_hs)
val_size_hs = total_size_hs - train_size_hs

# Split dataset into training and validation sets.
# Each split gets its own freshly seeded generator: without one, random_split
# draws from the global RNG and the validation set changes between sessions,
# which makes reported accuracies (and any reloaded checkpoint) incomparable.
train_nr_dataset, val_nr_dataset = random_split(
    nr_dataset,
    [train_size_nr, val_size_nr],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
train_hs_dataset, val_hs_dataset = random_split(
    hs_dataset,
    [train_size_hs, val_size_hs],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Create dataloaders for training and validation (only training batches are shuffled)
train_nr_dataloader = DataLoader(train_nr_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_nr_dataloader = DataLoader(val_nr_dataset, batch_size=BATCH_SIZE, shuffle=False)

train_hs_dataloader = DataLoader(train_hs_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_hs_dataloader = DataLoader(val_hs_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
## Create the classifier model
class CardDetectiveBERT(nn.Module):
    """Wrap an encoder with a single linear layer that scores each class.

    ``bert_model`` must expose ``config.hidden_size`` and, when called with
    ``input_ids`` / ``attention_mask`` keywords, return an indexable output
    whose element 1 is fed to the linear layer.
    """

    def __init__(self, bert_model, num_classes = 2):
        super().__init__()
        hidden_dim = bert_model.config.hidden_size
        self.bert = bert_model
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the raw (un-normalised) class logits for a batch."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output is the per-sequence vector the head consumes.
        return self.fc(encoder_out[1])

### BERT Trainer

In [None]:
## Instantiate the NetRunner model: a fresh 'bert-base-uncased' encoder wrapped by the classifier
nr_bert_model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')
nr_classifier = CardDetectiveBERT(bert_model=nr_bert_model)

In [None]:
## Instantiate the Hearthstone model: its own 'bert-base-uncased' encoder wrapped by the classifier
hs_bert_model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')
hs_classifier = CardDetectiveBERT(bert_model=hs_bert_model)

In [None]:
## Define the training loop -- NETRUNNER
## PyTorch code developed with some assistance for overall structure using ChatGPT
optimizer = torch.optim.Adam(nr_classifier.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

## Use GPU when one is available, otherwise stay on CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
nr_classifier.to(device)

num_epochs = 8

for epoch in range(num_epochs):
  nr_classifier.train()
  batch_losses = []

  for input_ids, labels in train_nr_dataloader:
    ## Move the batch to the device; token id 0 marks padding, so mask it out
    input_ids = input_ids.to(device)
    labels = labels.to(device)
    attention_mask = input_ids.ne(0).long()

    ## Clear stale gradients, then forward / backward / update
    optimizer.zero_grad()
    outputs = nr_classifier(input_ids, attention_mask=attention_mask)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    batch_losses.append(loss.item())

  ## Mean of the per-batch losses for this epoch
  avg_loss = sum(batch_losses) / len(train_nr_dataloader)
  print(f'Epoch {epoch+1}/{num_epochs}, Avg Batch Loss: {avg_loss:.4f}')

Epoch 1/8, Avg Batch Loss: 0.4819
Epoch 2/8, Avg Batch Loss: 0.2009
Epoch 3/8, Avg Batch Loss: 0.0995
Epoch 4/8, Avg Batch Loss: 0.0703
Epoch 5/8, Avg Batch Loss: 0.0346
Epoch 6/8, Avg Batch Loss: 0.0251
Epoch 7/8, Avg Batch Loss: 0.0147
Epoch 8/8, Avg Batch Loss: 0.0122


In [None]:
## Define the training loop -- HEARTHSTONE
optimizer = torch.optim.Adam(hs_classifier.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

## Use GPU when one is available, otherwise stay on CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
hs_classifier.to(device)

num_epochs = 8

for epoch in range(num_epochs):
  hs_classifier.train()
  batch_losses = []

  for input_ids, labels in train_hs_dataloader:
    ## Move the batch to the device; token id 0 marks padding, so mask it out
    input_ids = input_ids.to(device)
    labels = labels.to(device)
    attention_mask = input_ids.ne(0).long()

    ## Clear stale gradients, then forward / backward / update
    optimizer.zero_grad()
    outputs = hs_classifier(input_ids, attention_mask=attention_mask)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    batch_losses.append(loss.item())

  ## Mean of the per-batch losses for this epoch
  avg_loss = sum(batch_losses) / len(train_hs_dataloader)
  print(f'Epoch {epoch+1}/{num_epochs}, Avg Batch Loss: {avg_loss:.4f}')

Epoch 1/8, Avg Batch Loss: 0.5399
Epoch 2/8, Avg Batch Loss: 0.3860
Epoch 3/8, Avg Batch Loss: 0.1990
Epoch 4/8, Avg Batch Loss: 0.0965
Epoch 5/8, Avg Batch Loss: 0.0611
Epoch 6/8, Avg Batch Loss: 0.0291
Epoch 7/8, Avg Batch Loss: 0.0295
Epoch 8/8, Avg Batch Loss: 0.0231


In [None]:
## Save the fine-tuned NetRunner real/fake classifier weights
nr_class_path = model_checkpoints_path + "/GAN/NetRunner/CardDetectiveBERT.pth"
# torch.save does not create missing folders; make sure the target directory
# exists so a long training run is not lost to a failed write.
os.makedirs(os.path.dirname(nr_class_path), exist_ok=True)
torch.save(nr_classifier.state_dict(), nr_class_path)

In [None]:
## Save the fine-tuned Hearthstone real/fake classifier weights
hs_class_path = model_checkpoints_path + "/GAN/Hearthstone/CardDetectiveBERT.pth"
# torch.save does not create missing folders; make sure the target directory
# exists so a long training run is not lost to a failed write.
os.makedirs(os.path.dirname(hs_class_path), exist_ok=True)
torch.save(hs_classifier.state_dict(), hs_class_path)

In [None]:
## Function to compute accuracy
def compute_accuracy(model, dataloader, device=None):
  """Return the fraction of examples in ``dataloader`` that ``model`` classifies correctly.

  Args:
    model: module called as ``model(input_ids, attention_mask=...)`` that
      returns one row of class scores per example.
    dataloader: iterable of ``(input_ids, labels)`` tensor batches.
    device: device the batches are moved to. Defaults to the device of the
      model's first parameter (CPU when the model has no parameters), so the
      function no longer depends on a global ``device`` set by another cell.

  Returns:
    Accuracy as a float in [0, 1], or ``nan`` when the dataloader yields no examples.
  """
  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

  model.eval()
  correct_predictions = 0
  total_predictions = 0

  with torch.no_grad():
    for input_ids, labels in dataloader:
      input_ids, labels = input_ids.to(device), labels.to(device)

      # Token id 0 is treated as padding and hidden from the model
      attention_mask = (input_ids != 0).long()

      outputs = model(input_ids, attention_mask=attention_mask)
      predicted = outputs.argmax(dim=1)

      correct_predictions += (predicted == labels).sum().item()
      total_predictions += labels.size(0)

  # An empty loader has no defined accuracy; report nan instead of dividing by zero
  if total_predictions == 0:
    return float('nan')
  return correct_predictions / total_predictions

# Score both classifiers on their held-out validation loaders, then report
nr_accuracy = compute_accuracy(model=nr_classifier, dataloader=val_nr_dataloader)
hs_accuracy = compute_accuracy(model=hs_classifier, dataloader=val_hs_dataloader)

print(f'Accuracy on validation set, netrunner: {nr_accuracy:.4f}')
print(f'Accuracy on validation set, hearthstone: {hs_accuracy:.4f}')

Accuracy on validation set, netrunner: 0.9480
Accuracy on validation set, hearthstone: 0.9306


In [None]:
## Code to show some examples
def show_examples(model, dataloader, num_examples=5, device=None):
  """Print up to ``num_examples`` decoded inputs with their true and predicted labels.

  Args:
    model: module called as ``model(input_ids, attention_mask=...)`` that
      returns one row of class scores per example.
    dataloader: iterable of ``(input_ids, labels)`` tensor batches.
    num_examples: maximum number of examples to print; nothing is printed
      when it is zero or negative.
    device: device the batches are moved to. Defaults to the device of the
      model's first parameter (CPU when the model has no parameters).

  Text is decoded with the notebook-level ``tokenizer``. Label 1 is shown as
  "Real", anything else as "Fake".
  """
  # Guard first: the per-example check below only runs after something was printed
  if num_examples <= 0:
    return

  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

  model.eval()
  examples_shown = 0

  with torch.no_grad():
    for input_ids, labels in dataloader:
      input_ids, labels = input_ids.to(device), labels.to(device)

      # Token id 0 is treated as padding and hidden from the model
      attention_mask = (input_ids != 0).long()

      outputs = model(input_ids, attention_mask=attention_mask)
      predicted = outputs.argmax(dim=1)

      for i in range(len(predicted)):
        print(f'Example {examples_shown + 1}:')
        print(f'  Input: {tokenizer.decode(input_ids[i].cpu().numpy(), skip_special_tokens=True)}')
        print(f'  True Label: {"Real" if labels[i] == 1 else "Fake"}')
        print(f'  Predicted Label: {"Real" if predicted[i] == 1 else "Fake"}')
        print('-'*50)

        examples_shown += 1

        if examples_shown >= num_examples:
          return

# Show some example predictions: NetRunner first, then Hearthstone
for classifier, val_loader in (
    (nr_classifier, val_nr_dataloader),
    (hs_classifier, val_hs_dataloader),
):
    show_examples(classifier, val_loader)

Example 1:
  Input: the card named bankroll is a 1 cost program that requires 1 memory with the subtypes. it has an influence requirement of 2. the card text says whenever you make a successful run, you may place 1 credit from the bank on bankroll. trash : take all credits from bankroll.
  True Label: Real
  Predicted Label: Real
--------------------------------------------------
Example 2:
  Input: the card named netchip is a 1 cost hardware with the subtypes chip, consumergrade. it has an influence requirement of 2. the card text says netchip can host a program with a memory cost less than or equal to the number of copies of netchip installed. the memory cost of the hosted program does not count against your memory limit. limit 6 per deck.
  True Label: Real
  Predicted Label: Real
--------------------------------------------------
Example 3:
  Input: the card named digital rights management is a 1 cost operation with the subtypes. it has an influence requirement of 1. the card text 

In [None]:
## Wrong predictions
def show_wrong_predictions(model, dataloader, num_examples=5, device=None):
    """Collect up to ``num_examples`` misclassified examples from ``dataloader``.

    Args:
        model: module called as ``model(input_ids, attention_mask=...)`` that
            returns one row of class scores per example.
        dataloader: iterable of ``(input_ids, labels)`` tensor batches.
        num_examples: maximum number of mistakes to collect; zero or a
            negative value yields an empty list.
        device: device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU when the model has no parameters).

    Returns:
        A list (possibly empty, never ``None``) of dicts with the keys
        ``'input'`` (text decoded with the notebook-level ``tokenizer``),
        ``'true_label'`` and ``'predicted_label'`` ("Real" for label 1,
        otherwise "Fake").
    """
    wrong_examples = []
    if num_examples <= 0:
        return wrong_examples

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()

    with torch.no_grad():
        for input_ids, labels in dataloader:
            input_ids, labels = input_ids.to(device), labels.to(device)

            # Token id 0 is treated as padding and hidden from the model
            attention_mask = (input_ids != 0).long()

            outputs = model(input_ids, attention_mask=attention_mask)
            predicted = outputs.argmax(dim=1)

            for i in range(len(predicted)):
                if predicted[i] == labels[i]:
                    continue

                wrong_examples.append({
                    'input': tokenizer.decode(input_ids[i].cpu().numpy(), skip_special_tokens=True),
                    'true_label': "Real" if labels[i] == 1 else "Fake",
                    'predicted_label': "Real" if predicted[i] == 1 else "Fake"
                })

                if len(wrong_examples) >= num_examples:
                    return wrong_examples

    # Fewer mistakes than requested: still return what was found instead of
    # falling off the end and handing the caller None.
    return wrong_examples



In [None]:
# Get wrong predictions from the NetRunner validation set.
# A None result is treated as "no misclassifications" so the loop below
# never tries to iterate over None.
wrong_predictions = show_wrong_predictions(nr_classifier, val_nr_dataloader) or []

# Print the wrong predictions, numbered from 1
for i, example in enumerate(wrong_predictions, 1):
    print(f'Example {i}:')
    print(f'  Input: {example["input"]}')
    print(f'  True Label: {example["true_label"]}')
    print(f'  Predicted Label: {example["predicted_label"]}')
    print('-'*50)

Example 1:
  Input: the card named breakthrough is a 4 advancement agenda worth 2 points with the subtypes agenda. it has an influence requirement of 2. the card text says as an additional cost to play this agenda, spend click. click : subroutine end your run.
  True Label: Fake
  Predicted Label: Real
--------------------------------------------------
Example 2:
  Input: the card named hosted security control is a 3 cost hardware with the subtypes security. it has an influence requirement of 2. the card text says this hardware may not have any effect upon this hardware.
  True Label: Fake
  Predicted Label: Real
--------------------------------------------------
Example 3:
  Input: the card named capstone is a 2 cost hardware with the subtypes. it has an influence requirement of 3. the card text says click : trash any number of cards from your grip. for each trashed card of which you have another copy installed, draw 1 card.
  True Label: Real
  Predicted Label: Fake
-----------------

In [None]:
# Get wrong predictions from the Hearthstone validation set.
# A None result is treated as "no misclassifications" so the loop below
# never tries to iterate over None.
wrong_predictions = show_wrong_predictions(hs_classifier, val_hs_dataloader) or []

# Print the wrong predictions, numbered from 1
for i, example in enumerate(wrong_predictions, 1):
    print(f'Example {i}:')
    print(f'  Input: {example["input"]}')
    print(f'  True Label: {example["true_label"]}')
    print(f'  Predicted Label: {example["predicted_label"]}')
    print('-'*50)

Example 1:
  Input: the card named bloodwok is a 2 cost weapon with 3 attack and 1 durability, and includes the effects deathrattle. the card text says : deathrattle : choose another minion. if your hero died this game, summon it.
  True Label: Fake
  Predicted Label: Real
--------------------------------------------------
Example 2:
  Input: the card named g. h. c. r. e. c. is a 1 cost spell, and includes the effects discover. the card text says : discover a wild boar who died this game.
  True Label: Fake
  Predicted Label: Real
--------------------------------------------------
Example 3:
  Input: the card named spot on the ceiling is a 4 cost spell. the card text says : gain 3 armor.
  True Label: Fake
  Predicted Label: Real
--------------------------------------------------
Example 4:
  Input: the card named windy totem is a 3 cost mech, beast minion with 3 health and 2 attack, and includes the effects trigger visual. the card text says : after a friendly totem dies, summon a new