In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import pipeline

In [2]:
model_path = "../CLIP-GPT2/models/gpt2-large"

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
tokenizer.save_pretrained(model_path)

model = GPT2LMHeadModel.from_pretrained("gpt2-large")
model.save_pretrained(model_path)

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

In [4]:
## DATA PREPERATION
def file_to_list(text_loc, tokenizer, max_len):
    text_blocks = []
    
    f = open(text_loc, encoding="utf-8")
    text = f.read()
    
    tokens = tokenizer.encode(text)
    
    while len(tokens) > 0:
        holder = []
        if len(tokens) > max_len:
            holder = tokens[0:max_len]
            del tokens[0:max_len]
            if holder[-1] != 50256:
                holder.append(50256)
        
        else:
            holder = tokens
            tokens = []
            if holder[-1] != 50256:
                holder.append(50256)
                
        text_blocks.append(tokenizer.decode(holder))
    
    return text_blocks

def text_to_pieces(text_loc, tokenizer, max_len=1024):
    text_blocks = file_to_list(text_loc, tokenizer, max_len)    
    return text_blocks

text_loc = "../datasets/AVA/AVA-captions_clean_full_text.txt"
max_len = 1024

data = text_to_pieces(text_loc, tokenizer, max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (31188355 > 1024). Running this sequence through the model will result in indexing errors


In [5]:
train, test = train_test_split(data, test_size=0.1, random_state=42)

In [6]:
train_loc = "../CLIP-GPT2/data/large/train.txt"
test_loc = "../CLIP-GPT2/data/large/test.txt"

In [7]:
train_text = ""
train_text = train_text.join(train)

test_text = ""
test_text = test_text.join(test)

with open(train_loc, "x", encoding="utf-8") as f:
    f.write(train_text)

with open(test_loc, "x", encoding="utf-8") as f:
    f.write(test_text)

# Restart Runtime

In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import pipeline

In [2]:
model_path = "../CLIP-GPT2/models/gpt2-large"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

In [3]:
def load_dataset(train_path, test_path, tokenizer):
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=128,
    )
    test_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=test_path,
        block_size=128,
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

train_path = "../CLIP-GPT2/data/large/train.txt"
test_path = "../CLIP-GPT2/data/large/test.txt"
train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)



In [4]:
training_args = TrainingArguments(
    output_dir="../CLIP-GPT2/models/gpt2-large-AVA",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=1,
    eval_steps=10000,
    save_steps=10000,
    warmup_steps=500,
    fp16=True,
    fp16_opt_level="O1",
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

Using cuda_amp half precision backend


In [5]:
trainer.train()

***** Running training *****
  Num examples = 219333
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 27417
  Number of trainable parameters = 774030080


  0%|          | 0/27417 [00:00<?, ?it/s]

{'loss': 3.636, 'learning_rate': 5e-05, 'epoch': 0.02}
{'loss': 3.4952, 'learning_rate': 4.907121893227329e-05, 'epoch': 0.04}
{'loss': 3.4521, 'learning_rate': 4.814243786454657e-05, 'epoch': 0.05}
{'loss': 3.4143, 'learning_rate': 4.7213656796819856e-05, 'epoch': 0.07}
{'loss': 3.3904, 'learning_rate': 4.628487572909314e-05, 'epoch': 0.09}
{'loss': 3.3747, 'learning_rate': 4.535609466136643e-05, 'epoch': 0.11}
{'loss': 3.3704, 'learning_rate': 4.442731359363971e-05, 'epoch': 0.13}
{'loss': 3.35, 'learning_rate': 4.3498532525913e-05, 'epoch': 0.15}
{'loss': 3.3434, 'learning_rate': 4.256975145818628e-05, 'epoch': 0.16}
{'loss': 3.3283, 'learning_rate': 4.164097039045956e-05, 'epoch': 0.18}
{'loss': 3.3286, 'learning_rate': 4.0712189322732844e-05, 'epoch': 0.2}
{'loss': 3.3247, 'learning_rate': 3.978340825500613e-05, 'epoch': 0.22}
{'loss': 3.2998, 'learning_rate': 3.885462718727942e-05, 'epoch': 0.24}
{'loss': 3.2964, 'learning_rate': 3.79258461195527e-05, 'epoch': 0.26}
{'loss': 3.29

Saving model checkpoint to ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-10000
Configuration saved in ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-10000\config.json


{'loss': 3.2606, 'learning_rate': 3.235501727532786e-05, 'epoch': 0.36}


Model weights saved in ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-10000\pytorch_model.bin


{'loss': 3.2604, 'learning_rate': 3.142623620760115e-05, 'epoch': 0.38}
{'loss': 3.2666, 'learning_rate': 3.049745513987443e-05, 'epoch': 0.4}
{'loss': 3.2629, 'learning_rate': 2.9570531634283166e-05, 'epoch': 0.42}
{'loss': 3.242, 'learning_rate': 2.8641750566556453e-05, 'epoch': 0.44}
{'loss': 3.2375, 'learning_rate': 2.771296949882974e-05, 'epoch': 0.46}
{'loss': 3.251, 'learning_rate': 2.6784188431103023e-05, 'epoch': 0.47}
{'loss': 3.241, 'learning_rate': 2.5857264925511758e-05, 'epoch': 0.49}
{'loss': 3.2374, 'learning_rate': 2.4928483857785045e-05, 'epoch': 0.51}
{'loss': 3.2233, 'learning_rate': 2.399970279005833e-05, 'epoch': 0.53}
{'loss': 3.2201, 'learning_rate': 2.3070921722331612e-05, 'epoch': 0.55}
{'loss': 3.2248, 'learning_rate': 2.214399821674035e-05, 'epoch': 0.57}
{'loss': 3.2208, 'learning_rate': 2.1215217149013637e-05, 'epoch': 0.58}
{'loss': 3.2228, 'learning_rate': 2.028643608128692e-05, 'epoch': 0.6}
{'loss': 3.2131, 'learning_rate': 1.9357655013560204e-05, 'epo

Saving model checkpoint to ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-20000
Configuration saved in ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-20000\config.json


{'loss': 3.1949, 'learning_rate': 1.3790541293606272e-05, 'epoch': 0.73}


Model weights saved in ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-20000\pytorch_model.bin


{'loss': 3.2123, 'learning_rate': 1.2861760225879557e-05, 'epoch': 0.75}
{'loss': 3.1939, 'learning_rate': 1.193297915815284e-05, 'epoch': 0.77}
{'loss': 3.1908, 'learning_rate': 1.1004198090426125e-05, 'epoch': 0.78}
{'loss': 3.1903, 'learning_rate': 1.007541702269941e-05, 'epoch': 0.8}
{'loss': 3.1844, 'learning_rate': 9.148493517108147e-06, 'epoch': 0.82}
{'loss': 3.1863, 'learning_rate': 8.219712449381432e-06, 'epoch': 0.84}
{'loss': 3.1891, 'learning_rate': 7.290931381654717e-06, 'epoch': 0.86}
{'loss': 3.1839, 'learning_rate': 6.362150313928002e-06, 'epoch': 0.88}
{'loss': 3.1807, 'learning_rate': 5.4333692462012855e-06, 'epoch': 0.89}
{'loss': 3.1783, 'learning_rate': 4.506445740610023e-06, 'epoch': 0.91}
{'loss': 3.1765, 'learning_rate': 3.5776646728833082e-06, 'epoch': 0.93}
{'loss': 3.1778, 'learning_rate': 2.6488836051565926e-06, 'epoch': 0.95}
{'loss': 3.172, 'learning_rate': 1.720102537429877e-06, 'epoch': 0.97}
{'loss': 3.1665, 'learning_rate': 7.913214697031617e-07, 'epo



Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 8455.5649, 'train_samples_per_second': 25.939, 'train_steps_per_second': 3.242, 'train_loss': 3.262228260095242, 'epoch': 1.0}


TrainOutput(global_step=27417, training_loss=3.262228260095242, metrics={'train_runtime': 8455.5649, 'train_samples_per_second': 25.939, 'train_steps_per_second': 3.242, 'train_loss': 3.262228260095242, 'epoch': 1.0})

# Reset Runtime

In [2]:
import os
import shutil
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import pipeline

In [3]:
model_path = "../CLIP-GPT2/models/gpt2-large"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

m_loc = "../CLIP-GPT2/models/gpt2-large-AVA/checkpoint-27417"
model = GPT2LMHeadModel.from_pretrained(m_loc)

In [4]:
prefix = "I like this image, but "
tokens = tokenizer.encode(prefix)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
output = pipe(prefix, max_new_tokens=50, num_return_sequences=5, pad_token_id=50256, num_beams=3)

output

[{'generated_text': "I like this image, but ive never seen one like this before. great work. good idea, but i think it would have been better with a different background. i don't really like the white background. i like the idea, but the white background is a little distracting."},
 {'generated_text': 'I like this image, but ive never seen it done before. i like the idea, but i think the lighting is a little flat. i like the idea, but the lighting is a bit flat. i like the idea, but i think the lighting is a little flat.'},
 {'generated_text': "I like this image, but ive never seen it before. it's a very interesting image. i think it would have been better if you had cropped out the bright light on the right side of the image. this is a really cool photo. i love the colors and the lighting"},
 {'generated_text': "I like this image, but ive never seen it done before. i'm not sure how you did it, but i like it. i hope you explain how you did it. i'm not sure how you did it, but i like it