In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the bare file name to get the full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Install and import libraries
# !pip install transformers
# !pip install datasets


# DistilGPT2

In [5]:
from IPython.display import display, FileLink
import shutil
import os

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Train on the GPU when CUDA is available; otherwise fall back to the CPU.
gpu_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Disable the Weights & Biases logging integration.
os.environ["WANDB_DISABLED"] = "true"

# Load the pre-trained DistilGPT2 tokenizer, config and weights, then move the
# model onto the selected device.
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model = (
    GPT2LMHeadModel
    .from_pretrained(model_name, config=config)
    .to(gpu_device)
)

# Make sure the directory used as the dataset cache exists.
os.makedirs("/kaggle/working/cache", exist_ok=True)

def load_dataset(file_path):
    """Read a whole text file and return its contents as one string.

    The file is decoded as UTF-8 explicitly. Without an explicit encoding,
    ``open`` uses the platform's locale encoding, so the same file could be
    decoded differently (or fail to decode) from one machine to another.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The full contents of the file as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

# Single place for the training-file path, used by both the raw read and the dataset.
train_file_path = "./data/BBC/training/business_gpt2/business_gpt2_truncated.txt"

# Raw text of the training file (the dataset below is built from the path, not from this string).
train_data = load_dataset(train_file_path)

# Dataset over the training file with block_size=128, cached under /kaggle/working/cache.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_file_path,
    block_size=128,
    cache_dir="/kaggle/working/cache",
)

# Collator with masked-LM mode switched off (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration for the DistilGPT2 fine-tuning run.
training_args = TrainingArguments(
    output_dir="./distilgpt2_trained",
    overwrite_output_dir=True,
    num_train_epochs=8,
    per_device_train_batch_size=3,  # Increase this value for faster training
    # NOTE(review): the recorded run reports 5112 total optimization steps, which is
    # below save_steps, so no intermediate checkpoint is expected — confirm this is intended.
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=5e-5,
    warmup_steps=50,
    weight_decay=0.01,
    # The WANDB_DISABLED environment variable is deprecated (the Trainer warns about it
    # and points to report_to); disable the logging integrations explicitly instead.
    report_to="none",
)

# Bundle the model, its training configuration, the collator and the data.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Run the fine-tuning; kept as the cell's last expression so its result is displayed.
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 1915
  Num Epochs = 8
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 3
  Gradient Accumulation steps = 1
  Total optimization steps = 5112
  Number of trainable parameters = 81912576


Step,Training Loss



KeyboardInterrupt



# GPT2-Medium

In [6]:
from IPython.display import display, FileLink
import shutil
import os

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Train on the GPU when CUDA is available; otherwise fall back to the CPU.
gpu_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Disable the Weights & Biases logging integration.
os.environ["WANDB_DISABLED"] = "true"

# Load the pre-trained GPT2-Medium tokenizer, config and weights, then move the
# model onto the selected device.
model_name = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model_med = (
    GPT2LMHeadModel
    .from_pretrained(model_name, config=config)
    .to(gpu_device)
)

# Make sure the directory used as the dataset cache exists.
os.makedirs("/kaggle/working/cache", exist_ok=True)

def load_dataset(file_path):
    """Read a whole text file and return its contents as one string.

    The file is decoded as UTF-8 explicitly. Without an explicit encoding,
    ``open`` uses the platform's locale encoding, so the same file could be
    decoded differently (or fail to decode) from one machine to another.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The full contents of the file as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

# Single place for the training-file path, used by both the raw read and the dataset.
train_file_path = "./data/BBC/training/business_gpt2/business_gpt2_truncated.txt"

# Raw text of the training file (the dataset below is built from the path, not from this string).
train_data = load_dataset(train_file_path)

# Dataset over the training file with block_size=128, cached under /kaggle/working/cache.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_file_path,
    block_size=128,
    cache_dir="/kaggle/working/cache",
)

# Collator with masked-LM mode switched off (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration for the GPT2-Medium fine-tuning run.
training_args = TrainingArguments(
    output_dir="./gpt2_medium_trained",
    overwrite_output_dir=True,
    num_train_epochs=8,  # Increase the number of training epochs
    per_device_train_batch_size=3,
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=5e-5,
    warmup_steps=50,
    weight_decay=0.01,
    # The WANDB_DISABLED environment variable is deprecated (the Trainer warns about it
    # and points to report_to); disable the logging integrations explicitly instead.
    report_to="none",
)

# Bundle the model, its training configuration, the collator and the data.
trainer = Trainer(
    model=model_med,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Run the fine-tuning; kept as the cell's last expression so its result is displayed.
trainer.train()


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

loading file vocab.json from cache at C:\Users\dxrul/.cache\huggingface\hub\models--gpt2-medium\snapshots\425b0cc90498ac177aa51ba07be26fc2fea6af9d\vocab.json
loading file merges.txt from cache at C:\Users\dxrul/.cache\huggingface\hub\models--gpt2-medium\snapshots\425b0cc90498ac177aa51ba07be26fc2fea6af9d\merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None


Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

loading configuration file config.json from cache at C:\Users\dxrul/.cache\huggingface\hub\models--gpt2-medium\snapshots\425b0cc90498ac177aa51ba07be26fc2fea6af9d\config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-medium",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "predict_special_tokens": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
   

Downloading pytorch_model.bin:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
def generate_sentence(model, tokenizer, input_triplet, device="cpu", max_length=100, temperature=0.1, no_repeat_ngram_size=2, num_words=50):
    """Generate one sentence from a triplet prompt.

    The prompt is ``input_triplet`` followed by the separator ``" <==>"``. The
    model samples a continuation; only the newly generated text is kept, it is
    truncated to ``num_words`` space-separated words, and everything from the
    first ``"."`` onwards is dropped.

    Args:
        model: Language model exposing ``to``, ``eval``, ``train``, ``training``
            and ``generate``. It is moved to ``device``.
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and ``eos_token_id``.
        input_triplet: Prompt text placed before the ``" <==>"`` separator.
        device: Device on which generation runs.
        max_length: Passed to ``generate`` as ``max_length``.
        temperature: Sampling temperature passed to ``generate``.
        no_repeat_ngram_size: Passed to ``generate`` unchanged.
        num_words: Maximum number of space-separated words kept from the continuation.

    Returns:
        The text of the continuation up to (not including) its first ``"."``.
    """
    input_text = input_triplet + " <==>"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    attention_mask = torch.ones_like(input_ids)
    model = model.to(device)

    # A model that has just been trained may still be in train mode, which keeps
    # dropout active. Generate in eval mode, then restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=1,
            attention_mask=attention_mask,
            temperature=temperature,
            no_repeat_ngram_size=no_repeat_ngram_size,
            do_sample=True,
            # Name the padding token explicitly rather than relying on the implicit fallback.
            pad_token_id=tokenizer.eos_token_id,
        )
    finally:
        model.train(was_training)

    # The returned sequence starts with the prompt tokens. Cut them off by position:
    # removing the prompt by string replacement breaks whenever decoding does not
    # reproduce the prompt text exactly, and would also delete any repeat of the
    # prompt inside the continuation.
    prompt_length = input_ids.shape[-1]
    continuation_ids = output[0][prompt_length:]
    # Skip special tokens so markers such as the end-of-text token never reach the result.
    generated_sentence = tokenizer.decode(continuation_ids, skip_special_tokens=True).strip()

    # Limit the output length to num_words
    word_list = generated_sentence.split(" ")
    short_generated_sentence = " ".join(word_list[:num_words])

    # Keep only the text before the first full stop.
    first_sentence = short_generated_sentence.split(".")[0]

    return first_sentence

# Example triplet prompt, run through both fine-tuned models for comparison.
input_triplet = "executives | appear | to have decided on how to deal with low - cost competitors"

generated_sentence = generate_sentence(model, tokenizer, input_triplet)          # DistilGPT2
generated_sentence_med = generate_sentence(model_med, tokenizer, input_triplet)  # GPT2-Medium

# One line per model: DistilGPT2 first, GPT2-Medium second.
print(generated_sentence, generated_sentence_med, sep="\n")


# Save the models 

In [None]:
# Write the tokenizer and the DistilGPT2 model into one package directory.
package_dir = 'distilgpt2_trained_pkg'
os.makedirs(package_dir, exist_ok=True)
for artifact in (tokenizer, model):
    artifact.save_pretrained(package_dir)

# Zip the package directory into distilgpt2_trained_pkg.zip.
shutil.make_archive(package_dir, 'zip', package_dir)

# Show a download link for the archive.
display(FileLink(f"{package_dir}.zip"))

In [None]:
# Write the tokenizer and the GPT2-Medium model into the export directory.
# NOTE(review): this is the same path as the output_dir given to the GPT2-Medium
# TrainingArguments, so anything the Trainer wrote there is zipped as well — confirm intended.
export_dir = "gpt2_medium_trained"
archive_base = "gpt2_medium_trained_pkg"
os.makedirs(export_dir, exist_ok=True)
for artifact in (tokenizer, model_med):
    artifact.save_pretrained(export_dir)

# Zip the export directory into gpt2_medium_trained_pkg.zip.
shutil.make_archive(archive_base, 'zip', export_dir)

# Left as the cell's final expression so the notebook renders the download link.
FileLink(f"{archive_base}.zip")

