In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import pipeline

Used code from: https://rowlando13.medium.com/everything-gpt-2-5-fine-tuning-885aec508c4

In [2]:
# Local directory that holds a copy of the base model; the later sections of
# this notebook load the tokenizer and weights from this path.
model_path = "../CLIP-GPT2/models/gpt2-large"

# Fetch the pretrained "gpt2-large" tokenizer and store a copy under model_path.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
tokenizer.save_pretrained(model_path)

# Same for the language-model weights.
model = GPT2LMHeadModel.from_pretrained("gpt2-large")
model.save_pretrained(model_path)

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

In [4]:
## DATA PREPARATION
def file_to_list(text_loc, tokenizer, max_len):
    """Split a text file into decoded chunks of at most ``max_len`` tokens.

    The whole file is read and tokenized in one call. The token stream is
    then cut into consecutive windows of ``max_len`` tokens, an end-of-text
    token is appended to every window that does not already end with one,
    and each window is decoded back to text.

    Args:
        text_loc: Path of the UTF-8 encoded text file to read.
        tokenizer: Object providing ``encode(str)`` returning a list of token
            ids and ``decode(list)`` returning the chunk text. If it exposes
            a non-None ``eos_token_id`` that id is used as the end-of-text
            token; otherwise 50256 (GPT-2's end-of-text id) is used.
        max_len: Maximum number of tokens per chunk, counted before the
            end-of-text token is appended. Must be at least 1.

    Returns:
        list: One decoded chunk per window, in file order. Empty when the
        file yields no tokens.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError(f"max_len must be a positive integer, got {max_len!r}")

    # Context manager so the handle is closed even if read() raises.
    with open(text_loc, encoding="utf-8") as f:
        text = f.read()

    tokens = tokenizer.encode(text)

    # Prefer the tokenizer's own end-of-text id; fall back to GPT-2's.
    eos_id = getattr(tokenizer, "eos_token_id", None)
    if eos_id is None:
        eos_id = 50256

    text_blocks = []
    # Walk the token list by offset instead of repeatedly deleting from its
    # front: every front deletion shifts all remaining tokens, which becomes
    # quadratic on a corpus with tens of millions of tokens.
    for start in range(0, len(tokens), max_len):
        holder = tokens[start:start + max_len]
        if holder[-1] != eos_id:
            holder.append(eos_id)
        text_blocks.append(tokenizer.decode(holder))

    return text_blocks

def text_to_pieces(text_loc, tokenizer, max_len=1024):
    """Return the decoded text chunks of the file at ``text_loc``.

    Convenience wrapper that forwards its arguments to ``file_to_list``,
    with ``max_len`` defaulting to 1024 tokens per chunk.
    """
    return file_to_list(text_loc, tokenizer, max_len)

# Text file holding the corpus to fine-tune on.
text_loc = "../datasets/AVA/AVA-captions_clean_full_text.txt"
# Tokens per chunk, counted before the end-of-text token is appended.
max_len = 1024

# Tokenize the whole file in one call and cut it into decoded chunks. The
# tokenizer's "sequence length is longer than the specified maximum" warning
# is expected here: chunking happens only after the full file is encoded.
data = text_to_pieces(text_loc, tokenizer, max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (31188355 > 1024). Running this sequence through the model will result in indexing errors


In [5]:
# Hold out 10% of the chunks; the fixed random_state makes the split repeatable.
train, test = train_test_split(data, test_size=0.1, random_state=42)

In [6]:
# Destination files for the training and held-out text. The fine-tuning
# section of this notebook reads the same two paths back in.
train_loc = "../CLIP-GPT2/data/large/train.txt"
test_loc = "../CLIP-GPT2/data/large/test.txt"

In [7]:
# Concatenate the chunks of each split into one string. No separator is
# inserted between chunks.
train_text = "".join(train)
test_text = "".join(test)

# open() does not create missing parent directories, so make sure they exist
# before writing (a fresh checkout would otherwise fail with FileNotFoundError).
os.makedirs(os.path.dirname(train_loc) or ".", exist_ok=True)
os.makedirs(os.path.dirname(test_loc) or ".", exist_ok=True)

# Mode "x" (exclusive creation) is kept on purpose: re-running this cell raises
# FileExistsError instead of silently overwriting an existing split.
with open(train_loc, "x", encoding="utf-8") as f:
    f.write(train_text)

with open(test_loc, "x", encoding="utf-8") as f:
    f.write(test_text)

# Restart Runtime

In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import pipeline

In [2]:
# After the runtime restart, load the base tokenizer and weights from the
# local copy that the first section saved under this path.
model_path = "../CLIP-GPT2/models/gpt2-large"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

In [3]:
def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/eval datasets and the collator used for fine-tuning.

    Args:
        train_path: Path of the training text file.
        test_path: Path of the held-out text file.
        tokenizer: Tokenizer passed to both datasets and to the collator.
        block_size: ``block_size`` forwarded to both ``TextDataset``
            instances. Defaults to 128, the value that was previously
            hard-coded in each call.

    Returns:
        tuple: ``(train_dataset, test_dataset, data_collator)``.
    """
    # NOTE(review): recent transformers releases appear to flag TextDataset
    # as deprecated in favour of the `datasets` library — confirm before
    # upgrading the dependency.
    def _text_dataset(file_path):
        # Both splits share the tokenizer and block size; only the file differs.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size,
        )

    train_dataset = _text_dataset(train_path)
    test_dataset = _text_dataset(test_path)

    # mlm=False: no masked-language-model masking is applied by the collator.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

# Same files that the data-preparation section wrote out.
train_path = "../CLIP-GPT2/data/large/train.txt"
test_path = "../CLIP-GPT2/data/large/test.txt"
# Build both datasets and the collator with the locally loaded tokenizer.
train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)



In [4]:
# Configuration of the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # --- checkpointing ---
    output_dir="../CLIP-GPT2/models/gpt2-large-AVA",
    overwrite_output_dir=False,
    save_steps=10000,
    # --- optimisation schedule ---
    num_train_epochs=3,
    warmup_steps=500,
    per_device_train_batch_size=8,
    # --- mixed precision ---
    # NOTE(review): the run log reports the cuda_amp backend; fp16_opt_level
    # looks like an Apex-only setting and is probably unused here — confirm.
    fp16=True,
    fp16_opt_level="O1",
    # --- evaluation ---
    # NOTE(review): no evaluation strategy is set and the training log shows
    # no eval entries, so eval_steps may currently have no effect — confirm
    # against the TrainingArguments documentation for the installed version.
    per_device_eval_batch_size=1,
    eval_steps=5000,
    prediction_loss_only=True,
)

# Wire the model, the datasets and the collator into the training loop.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

Using cuda_amp half precision backend


In [5]:
# Resume from the newest checkpoint in the output directory when one exists,
# otherwise start training from scratch. Passing resume_from_checkpoint=True
# unconditionally fails on a first run, when no checkpoint has been written.
_ckpt_root = training_args.output_dir
_has_checkpoint = os.path.isdir(_ckpt_root) and any(
    entry.startswith("checkpoint-")
    and entry[len("checkpoint-"):].isdigit()
    and os.path.isdir(os.path.join(_ckpt_root, entry))
    for entry in os.listdir(_ckpt_root)
)

# Kept as the last expression so the cell still displays the TrainOutput.
trainer.train(resume_from_checkpoint=True if _has_checkpoint else None)

Loading model from ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-20000.
***** Running training *****
  Num examples = 219333
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 82251
  Number of trainable parameters = 774030080
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 20000
  Will skip the first 0 epochs then the first 20000 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/82251 [00:00<?, ?it/s]

{'loss': 3.2334, 'learning_rate': 3.7772015021222984e-05, 'epoch': 0.75}
{'loss': 3.2254, 'learning_rate': 3.746620836442368e-05, 'epoch': 0.77}
{'loss': 3.2262, 'learning_rate': 3.7160401707624375e-05, 'epoch': 0.78}
{'loss': 3.228, 'learning_rate': 3.685459505082507e-05, 'epoch': 0.8}
{'loss': 3.2226, 'learning_rate': 3.654878839402576e-05, 'epoch': 0.82}
{'loss': 3.2257, 'learning_rate': 3.624359335054006e-05, 'epoch': 0.84}
{'loss': 3.2301, 'learning_rate': 3.593778669374075e-05, 'epoch': 0.86}
{'loss': 3.2246, 'learning_rate': 3.563198003694145e-05, 'epoch': 0.88}
{'loss': 3.2214, 'learning_rate': 3.532617338014214e-05, 'epoch': 0.89}
{'loss': 3.219, 'learning_rate': 3.5020366723342835e-05, 'epoch': 0.91}
{'loss': 3.2168, 'learning_rate': 3.471456006654353e-05, 'epoch': 0.93}
{'loss': 3.2179, 'learning_rate': 3.4408753409744226e-05, 'epoch': 0.95}
{'loss': 3.2115, 'learning_rate': 3.410294675294492e-05, 'epoch': 0.97}
{'loss': 3.2056, 'learning_rate': 3.3797751709459214e-05, 'epoc

Saving model checkpoint to ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-30000
Configuration saved in ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-30000\config.json


{'loss': 3.0534, 'learning_rate': 3.196352338197698e-05, 'epoch': 1.09}


Model weights saved in ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-30000\pytorch_model.bin


{'loss': 3.0554, 'learning_rate': 3.165771672517768e-05, 'epoch': 1.11}
{'loss': 3.0569, 'learning_rate': 3.135191006837837e-05, 'epoch': 1.13}
{'loss': 3.0541, 'learning_rate': 3.104671502489266e-05, 'epoch': 1.15}
{'loss': 3.0515, 'learning_rate': 3.074090836809336e-05, 'epoch': 1.17}
{'loss': 3.0587, 'learning_rate': 3.043510171129405e-05, 'epoch': 1.19}
{'loss': 3.0526, 'learning_rate': 3.0129295054494748e-05, 'epoch': 1.2}
{'loss': 3.0477, 'learning_rate': 2.9824100011009044e-05, 'epoch': 1.22}
{'loss': 3.0596, 'learning_rate': 2.9518293354209737e-05, 'epoch': 1.24}
{'loss': 3.0616, 'learning_rate': 2.921248669741043e-05, 'epoch': 1.26}
{'loss': 3.0613, 'learning_rate': 2.890729165392472e-05, 'epoch': 1.28}
{'loss': 3.0536, 'learning_rate': 2.8601484997125417e-05, 'epoch': 1.29}
{'loss': 3.0499, 'learning_rate': 2.8295678340326116e-05, 'epoch': 1.31}
{'loss': 3.0568, 'learning_rate': 2.7989871683526808e-05, 'epoch': 1.33}
{'loss': 3.0621, 'learning_rate': 2.76840650267275e-05, 'ep

Saving model checkpoint to ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-40000
Configuration saved in ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-40000\config.json


{'loss': 3.0491, 'learning_rate': 2.584983669924527e-05, 'epoch': 1.46}


Model weights saved in ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-40000\pytorch_model.bin


{'loss': 3.0637, 'learning_rate': 2.5544641655759567e-05, 'epoch': 1.48}
{'loss': 3.0558, 'learning_rate': 2.523883499896026e-05, 'epoch': 1.5}
{'loss': 3.0693, 'learning_rate': 2.493302834216095e-05, 'epoch': 1.51}
{'loss': 3.0586, 'learning_rate': 2.4627221685361647e-05, 'epoch': 1.53}
{'loss': 3.047, 'learning_rate': 2.4321415028562342e-05, 'epoch': 1.55}
{'loss': 3.0601, 'learning_rate': 2.4015608371763038e-05, 'epoch': 1.57}
{'loss': 3.0444, 'learning_rate': 2.3709801714963733e-05, 'epoch': 1.59}
{'loss': 3.0513, 'learning_rate': 2.3403995058164426e-05, 'epoch': 1.6}
{'loss': 3.0502, 'learning_rate': 2.309880001467872e-05, 'epoch': 1.62}
{'loss': 3.0457, 'learning_rate': 2.2792993357879417e-05, 'epoch': 1.64}
{'loss': 3.0451, 'learning_rate': 2.248718670108011e-05, 'epoch': 1.66}
{'loss': 3.0535, 'learning_rate': 2.2181991657594402e-05, 'epoch': 1.68}
{'loss': 3.0484, 'learning_rate': 2.1876185000795098e-05, 'epoch': 1.7}
{'loss': 3.0464, 'learning_rate': 2.1570378343995793e-05, '

Saving model checkpoint to ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-50000
Configuration saved in ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-50000\config.json


{'loss': 3.0386, 'learning_rate': 1.9736761629827157e-05, 'epoch': 1.82}


Model weights saved in ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-50000\pytorch_model.bin


{'loss': 3.0491, 'learning_rate': 1.9430954973027853e-05, 'epoch': 1.84}
{'loss': 3.0452, 'learning_rate': 1.912514831622855e-05, 'epoch': 1.86}
{'loss': 3.044, 'learning_rate': 1.8819341659429244e-05, 'epoch': 1.88}
{'loss': 3.0354, 'learning_rate': 1.851353500262994e-05, 'epoch': 1.9}
{'loss': 3.0437, 'learning_rate': 1.8207728345830632e-05, 'epoch': 1.91}
{'loss': 3.035, 'learning_rate': 1.7901921689031327e-05, 'epoch': 1.93}
{'loss': 3.036, 'learning_rate': 1.7596115032232023e-05, 'epoch': 1.95}
{'loss': 3.0398, 'learning_rate': 1.729030837543272e-05, 'epoch': 1.97}
{'loss': 3.0367, 'learning_rate': 1.698450171863341e-05, 'epoch': 1.99}
{'loss': 2.9716, 'learning_rate': 1.6678695061834106e-05, 'epoch': 2.01}
{'loss': 2.8537, 'learning_rate': 1.63735000183484e-05, 'epoch': 2.02}
{'loss': 2.8504, 'learning_rate': 1.6067693361549094e-05, 'epoch': 2.04}
{'loss': 2.8578, 'learning_rate': 1.576188670474979e-05, 'epoch': 2.06}
{'loss': 2.8566, 'learning_rate': 1.5456691661264083e-05, 'epo

Saving model checkpoint to ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-60000
Configuration saved in ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-60000\config.json


{'loss': 2.8618, 'learning_rate': 1.3622463333781852e-05, 'epoch': 2.19}


Model weights saved in ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-60000\pytorch_model.bin


{'loss': 2.8531, 'learning_rate': 1.3316656676982545e-05, 'epoch': 2.21}
{'loss': 2.851, 'learning_rate': 1.3010850020183241e-05, 'epoch': 2.22}
{'loss': 2.8522, 'learning_rate': 1.2705043363383933e-05, 'epoch': 2.24}
{'loss': 2.8534, 'learning_rate': 1.239984831989823e-05, 'epoch': 2.26}
{'loss': 2.8613, 'learning_rate': 1.2094041663098923e-05, 'epoch': 2.28}
{'loss': 2.8597, 'learning_rate': 1.1788235006299619e-05, 'epoch': 2.3}
{'loss': 2.8528, 'learning_rate': 1.1482428349500312e-05, 'epoch': 2.32}
{'loss': 2.8613, 'learning_rate': 1.1176621692701008e-05, 'epoch': 2.33}
{'loss': 2.8557, 'learning_rate': 1.0870815035901702e-05, 'epoch': 2.35}
{'loss': 2.8584, 'learning_rate': 1.0565008379102396e-05, 'epoch': 2.37}
{'loss': 2.8589, 'learning_rate': 1.0259201722303091e-05, 'epoch': 2.39}
{'loss': 2.8555, 'learning_rate': 9.954006678817384e-06, 'epoch': 2.41}
{'loss': 2.8543, 'learning_rate': 9.64820002201808e-06, 'epoch': 2.43}
{'loss': 2.8622, 'learning_rate': 9.342393365218773e-06, 

Saving model checkpoint to ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-70000
Configuration saved in ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-70000\config.json


{'loss': 2.8531, 'learning_rate': 7.508776651050141e-06, 'epoch': 2.55}


Model weights saved in ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-70000\pytorch_model.bin


{'loss': 2.8515, 'learning_rate': 7.202969994250836e-06, 'epoch': 2.57}
{'loss': 2.8498, 'learning_rate': 6.8971633374515305e-06, 'epoch': 2.59}
{'loss': 2.8517, 'learning_rate': 6.591356680652224e-06, 'epoch': 2.61}
{'loss': 2.8583, 'learning_rate': 6.285550023852919e-06, 'epoch': 2.63}
{'loss': 2.8487, 'learning_rate': 5.979743367053614e-06, 'epoch': 2.64}
{'loss': 2.8498, 'learning_rate': 5.674548323567907e-06, 'epoch': 2.66}
{'loss': 2.852, 'learning_rate': 5.368741666768602e-06, 'epoch': 2.68}
{'loss': 2.8465, 'learning_rate': 5.062935009969298e-06, 'epoch': 2.7}
{'loss': 2.8481, 'learning_rate': 4.757128353169992e-06, 'epoch': 2.72}
{'loss': 2.8429, 'learning_rate': 4.451933309684286e-06, 'epoch': 2.74}
{'loss': 2.8465, 'learning_rate': 4.1461266528849806e-06, 'epoch': 2.75}
{'loss': 2.8453, 'learning_rate': 3.840319996085675e-06, 'epoch': 2.77}
{'loss': 2.8446, 'learning_rate': 3.5345133392863696e-06, 'epoch': 2.79}
{'loss': 2.8457, 'learning_rate': 3.2287066824870647e-06, 'epoc

Saving model checkpoint to ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-80000
Configuration saved in ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-80000\config.json


{'loss': 2.8567, 'learning_rate': 1.3944783550048318e-06, 'epoch': 2.92}


Model weights saved in ../CLIP-GPT2/models/gpt2-large-AVA\checkpoint-80000\pytorch_model.bin


{'loss': 2.8581, 'learning_rate': 1.0886716982055265e-06, 'epoch': 2.94}
{'loss': 2.8443, 'learning_rate': 7.828650414062213e-07, 'epoch': 2.95}
{'loss': 2.8479, 'learning_rate': 4.770583846069161e-07, 'epoch': 2.97}
{'loss': 2.8435, 'learning_rate': 1.7125172780761094e-07, 'epoch': 2.99}




Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 19054.3297, 'train_samples_per_second': 34.533, 'train_steps_per_second': 4.317, 'train_loss': 2.258287075406473, 'epoch': 3.0}


TrainOutput(global_step=82251, training_loss=2.258287075406473, metrics={'train_runtime': 19054.3297, 'train_samples_per_second': 34.533, 'train_steps_per_second': 4.317, 'train_loss': 2.258287075406473, 'epoch': 3.0})

# Restart Runtime

In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import pipeline

In [2]:
# Tokenizer: loaded from the local copy of the base model.
model_path = "../CLIP-GPT2/models/gpt2-large"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Weights: the checkpoint written at step 80000 of the fine-tuning run. The
# run log ends at global step 82251, so the steps after 80000 are not part of
# this checkpoint unless the final model was saved separately.
m_loc = "../CLIP-GPT2/models/gpt2-large-AVA/checkpoint-80000"
model = GPT2LMHeadModel.from_pretrained(m_loc)

In [4]:
# Prompt that every generated sample continues.
prefix = "I like this image, but "
# NOTE(review): `tokens` is not used anywhere else in this cell; the pipeline
# below is given the raw `prefix` string.
tokens = tokenizer.encode(prefix)

# Text-generation pipeline around the fine-tuned model and base tokenizer.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
# Five beams, five returned sequences, at most 50 new tokens each.
# pad_token_id=50256 is the same id the data-preparation code appends as the
# end-of-text token.
output = pipe(prefix, max_new_tokens=50, num_return_sequences=5, pad_token_id=50256, num_beams=5)

# Last expression of the cell: display the generated continuations.
output

[{'generated_text': "I like this image, but ive seen a lot of them. i think this is the best of the lot. i really like the composition and the colors. the only thing i don't like is the border. i think it takes away from the simplicity of the shot. i"},
 {'generated_text': 'I like this image, but ive seen so many of them now that its starting to get a little boring. this is a nice image. i really like the angle from which you shot this image. i also like the fact that you shot this image in black and white. nice'},
 {'generated_text': 'I like this image, but ive never been able to pull it off as well as you have here. i love the way the light is hitting the top of the glass, and the way it reflects on the table below. great job. i really like this one. the only'},
 {'generated_text': 'I like this image, but ive never been able to pull it off. great job. this is a great idea, but i think it would have been better if you could have gotten rid of the shadow in the bottom of the picture. g