In [1]:
!pip install transformers



In [2]:
!pip install -U PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [3]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/244.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [5]:
import pandas as pd
import numpy as np
import re
from PyPDF2 import PdfReader
import os
import docx

In [6]:
# Functions to read different file types
def read_pdf(file_path):
    with open(file_path, "rb") as file:
        pdf_reader = PdfReader(file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            text += pdf_reader.pages[page_num].extract_text()
    return text

def read_word(file_path):
    doc = docx.Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

def read_txt(file_path):
    with open(file_path, "r") as file:
        text = file.read()
    return text

def read_documents_from_directory(directory):
    combined_text = ""
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if filename.endswith(".pdf"):
            combined_text += read_pdf(file_path)
        elif filename.endswith(".docx"):
            combined_text += read_word(file_path)
        elif filename.endswith(".txt"):
            combined_text += read_txt(file_path)
    return combined_text

In [11]:
from google.colab import files
uploaded = files.upload()

Saving shakespeare.txt to shakespeare.txt


In [12]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
# Provide the path to the single text file
file_path = '/content/drive/MyDrive/Colab Notebooks/data/shakespeare.txt'

# Use read_txt() to read the content of the text file
text_data = read_txt(file_path)
text_data = re.sub(r'\n+', '\n', text_data).strip()  # Remove excess newline characters

In [23]:
with open("/content/drive/MyDrive/Colab Notebooks/data/shakespeare.txt", "w") as f:
    f.write(text_data)

In [24]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [25]:
def load_dataset(file_path, tokenizer, block_size = 128):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset

In [26]:
def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator

In [27]:
def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  train_dataset = load_dataset(train_file_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  tokenizer.save_pretrained(output_dir)

  model = GPT2LMHeadModel.from_pretrained(model_name)

  model.save_pretrained(output_dir)

  training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )

  trainer.train()
  trainer.save_model()

In [29]:
train_file_path = "/content/drive/MyDrive/Colab Notebooks/data/shakespeare.txt"
model_name = 'gpt2'
output_dir = '/content/drive/MyDrive/Colab Notebooks/models/prodigy1'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 50.0
save_steps = 50000

In [33]:
import os

# Disable WandB completely
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Train
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
500,3.6741
1000,3.3519
1500,3.1313
2000,2.9567
2500,2.7749
3000,2.6115
3500,2.4621
4000,2.3069
4500,2.1866
5000,2.0438


In [36]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [37]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model


def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer

def generate_text(model_path, sequence, max_length):

    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [38]:
model1_path = "/content/drive/MyDrive/Colab Notebooks/models/prodigy1"
sequence1 = "Shakespeare "
max_len = 50
generate_text(model1_path, sequence1, max_len)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Shakespeare urchins, and all despair
That might have revolted his tenderness.
QUEEN ELIZABETH:
Madam, 'tis so: to my poor contents
I have consumed all my valour: therefore


In [40]:
model1_path = "/content/drive/MyDrive/Colab Notebooks/models/prodigy1"
sequence1 = "Once upon a time "
max_len = 50
generate_text(model1_path, sequence1, max_len)

Once upon a time urchins me.
BIONDELLO:
Now, by Saint Jamy,
You know my fortunes: on the first,
With his son-in-law, Angelo, my brother,
With


In [46]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model1_path = "/content/drive/MyDrive/Colab Notebooks/models/prodigy1"
sequence1 = "I have won"
max_len = 100  # Ensure `max_len` is defined

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model1_path)
model = AutoModelForCausalLM.from_pretrained(model1_path)

# Assign pad token if it's missing
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Set EOS token as pad token

# Tokenize the input sequence with padding and attention mask
inputs = tokenizer(sequence1, return_tensors="pt", padding=True, truncation=True)

# Generate the output sequence
output = model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_length=max_len,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    early_stopping=True
)

# Decode the generated text to a readable format
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


I have won, my liege; and I will return to London
The queen your obedient servant will deliver.
KING RICHARD III:
Ay, ay, and deliver that lie, that here king,
With all the rest revolted faction, from hence
I leave you. Sir William Brandon, you shall bear
My standard. You shall be new comer
Than when I arrive here, when this throne
Was crown'd by your father. As for you,
