In [1]:
%pip install transformers
%pip install datasets
%pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 28.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 69.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 47.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 29.2 M

In [2]:
# Some setup to make wordwrap work
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)


In [3]:
# Get data
from google.colab import drive
from datasets import load_from_disk

drive.mount('/content/gdrive')

subreddit = "subreddit-wallstreetbets"

datasets = load_from_disk(f"/content/gdrive/My Drive/10617_project/train_test_{subreddit}")


Mounted at /content/gdrive


In [4]:
# Some basic data analysis on our utterances
import pandas as pd
df = pd.DataFrame([len(utterance["utterance"]) for utterance in datasets["train"]])
df.describe()

KeyboardInterrupt: ignored

In [None]:
from transformers import GPT2LMHeadModel
from transformers import GPT2Tokenizer
from transformers import AutoTokenizer

model = GPT2LMHeadModel.from_pretrained("gpt2")

# tokenizer = GPT2Tokenizer.from_pretrained("gpt2", padding_side='left')
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
context_length = 32
# TODO: figure out padding

def tokenization(utterance):
    outputs = tokenizer(utterance["utterance"], 
                         truncation=True,
                         max_length=context_length,
                         return_overflowing_tokens=True,
                         return_length=True)
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

                     

tokenized_datasets = datasets.map(tokenization, batched=True, remove_columns=datasets["train"].column_names)

In [94]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 842
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 112
    })
})

In [95]:
# Fine-tuning

model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")


GPT-2 size: 124.4M parameters


In [96]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) # use CLM

out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 16])
attention_mask shape: torch.Size([5, 16])
labels shape: torch.Size([5, 16])


In [None]:
def visualize_input_output(inputs, outputs, full_inputs):
  for idx, (input, output, actual) in enumerate(zip(inputs, outputs, full_inputs)):
    print(f"[Input]\n", input)
    print(f"[Output]\n", output)
    print(f"[Actual]\n", actual)
    print("=" * 300)

decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
visualize_input_output(inputs, decoded_outputs, utterances_to_use)




In [97]:
from huggingface_hub import notebook_login

notebook_login()


Login successful
Your token has been saved to /root/.huggingface/token


In [100]:
from transformers import Trainer, TrainingArguments

output_dir = f"/content/gdrive/My Drive/10617_project/model_output_{subreddit}"

args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

trainer.train()


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Cloning https://huggingface.co/fanpu/model_output_subreddit-wallstreet into local empty directory.
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 842
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 8
  Total optimization steps = 3


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=3, training_loss=4.906888008117676, metrics={'train_runtime': 2.2556, 'train_samples_per_second': 373.288, 'train_steps_per_second': 1.33, 'total_flos': 6271008768000.0, 'train_loss': 4.906888008117676, 'epoch': 0.89})

In [101]:
trainer.push_to_hub()


Saving model checkpoint to /content/gdrive/My Drive/10617_project/model_output_subreddit-wallstreet
Configuration saved in /content/gdrive/My Drive/10617_project/model_output_subreddit-wallstreet/config.json
Model weights saved in /content/gdrive/My Drive/10617_project/model_output_subreddit-wallstreet/pytorch_model.bin
tokenizer config file saved in /content/gdrive/My Drive/10617_project/model_output_subreddit-wallstreet/tokenizer_config.json
Special tokens file saved in /content/gdrive/My Drive/10617_project/model_output_subreddit-wallstreet/special_tokens_map.json


KeyboardInterrupt: ignored

In [None]:
# Evaluate it

import torch
from transformers import pipeline

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pipe = pipeline(
    "text-generation", model="gpt2", device=torch.cuda.current_device()
)

In [110]:
txt = """\
So what are your thoughts right now about the stock market? I think
"""
pipe(txt, num_return_sequences=1)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "So what are your thoughts right now about the stock market? I think\n\nIt looks to be healthy, and the stock market is\n\nstill undervaluing. The next stage is\n\ngetting the economy up again, that's what\n"}]

In [107]:
torch.cuda.current_device()

0

In [None]:
from transformers import GPT2Tokenizer
import torch
import random
import numpy as np

if 1 == 0:
  def generate_input_to_continue(utterance):
    tokens = utterance.split()
    return " ".join(tokens[: random.randint(len(tokens)//2, len(tokens))])
    
  
  tokenizer = GPT2Tokenizer.from_pretrained("gpt2", padding_side='left')
  tokenizer.pad_token = tokenizer.eos_token
  
  utterances_to_use = np.random.choice(filtered_utterances, 10)
  inputs = list(map(generate_input_to_continue, utterances_to_use))
  model_inputs = tokenizer(inputs, padding="longest", return_tensors="pt")

In [None]:
# model_inputs
# outputs = model.generate(**model_inputs, max_length=100, do_sample=True)


In [None]:
from transformers import GPT2Tokenizer
import torch
import random
import numpy as np
from datasets import Dataset

# Tokenize, and split into train and test
# test_percentage = 10

tokenizer = GPT2Tokenizer.from_pretrained("gpt2", padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

np.random.seed(10617)
np.random.shuffle(filtered_utterances)
dataset_length = len(filtered_utterances)
len_train =  int(dataset_length * (100 - test_percentage) / 100.)

tokenized_datasets = {
  "train" : tokenizer(filtered_utterances[:len_train], padding="longest", return_tensors="pt"),
  "valid" : tokenizer(filtered_utterances[len_train:], padding="longest", return_tensors="pt")
}