In [1]:
import torch
from torchvision import models, transforms
from PIL import Image

In [2]:
#Load DenseNet model
model = models.densenet121(pretrained=True)
model.eval()

#convert input image to tensor and normalize
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

#identify objects in image
def predict_image(image_path):
    #load up image
    image = transform(Image.open(image_path)).unsqueeze(0)

    with torch.no_grad():
        outputs = model(image)

    #probabilities
    probabilities = torch.nn.functional.softmax(outputs[0], dim=0)

    # Load the labels (Downloading the labels from a hosted file for this example)
    with open('imagenet_classes.txt', 'r') as f:
        labels = [line.strip() for line in f.readlines()]

    #top 5 objects
    top5_prob, top5_catid = torch.topk(probabilities, 5)
    for i in range(top5_prob.size(0)):
        print(labels[top5_catid[i]], top5_prob[i].item())



In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
#load dataset
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/script project/scripts.csv')

In [5]:
df['Dialogue'] = df['Dialogue'].fillna('').astype(str)
df = df[['Character', 'Dialogue']]

#group by Character and concatenate dialogues separated by new line
grouped_df = df.groupby('Character')['Dialogue'].apply(lambda x: "\n".join(x)).reset_index()

#split data into training and validation sets
train, val = train_test_split(grouped_df, test_size=0.1, random_state=42)

In [6]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

In [7]:
#init tokenizer/model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [14]:
from datasets import load_dataset, DatasetDict
from transformers import DataCollatorForLanguageModeling

In [9]:
#puttinge train and validation data to txt files
train['Dialogue'].to_csv('train.txt', index=False, header=False)
val['Dialogue'].to_csv('val.txt', index=False, header=False)

In [10]:
#load dataset from text file
def load_dataset_from_file(train_file, val_file):
    train_dataset = load_dataset('text', data_files={'train': train_file})['train']
    val_dataset = load_dataset('text', data_files={'validation': val_file})['validation']

    # Tokenizing the datasets
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

    tokenized_datasets = DatasetDict({
        'train': train_dataset.map(tokenize_function, batched=True),
        'validation': val_dataset.map(tokenize_function, batched=True)
    })

    return tokenized_datasets

dataset = load_dataset_from_file('train.txt', 'val.txt')

Map:   0%|          | 0/11232 [00:00<?, ? examples/s]

In [None]:
!pip install transformers[torch] -U

In [15]:
training_args = TrainingArguments(
    output_dir="./gpt2_finetuned_seinfeld",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,  #GPU memory
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',  #storing logs
    logging_steps=100,  #metrics every 100 steps
    evaluation_strategy="epoch",
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    data_collator=data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [16]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.5288,3.431854
2,3.1331,3.407526
3,2.9354,3.440931


TrainOutput(global_step=32538, training_loss=3.2244258706868187, metrics={'train_runtime': 5978.3482, 'train_samples_per_second': 21.771, 'train_steps_per_second': 5.443, 'total_flos': 8501920137216000.0, 'train_loss': 3.2244258706868187, 'epoch': 3.0})

In [17]:
model.save_pretrained("./gpt2_finetuned_seinfeld_model")
tokenizer.save_pretrained("./gpt2_finetuned_seinfeld_tokenizer")

('./gpt2_finetuned_seinfeld_tokenizer/tokenizer_config.json',
 './gpt2_finetuned_seinfeld_tokenizer/special_tokens_map.json',
 './gpt2_finetuned_seinfeld_tokenizer/vocab.json',
 './gpt2_finetuned_seinfeld_tokenizer/merges.txt',
 './gpt2_finetuned_seinfeld_tokenizer/added_tokens.json')

In [26]:
def generate_dialogue(seed_text, model_path, tokenizer_path, max_length=50):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

    # Encode the seed text to tensor
    input_ids = tokenizer.encode(seed_text, return_tensors='pt')

    # Generate text
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, temperature=0.7)

    # Decode and print the output text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(generated_text)

# Example usage
generate_dialogue("wow", "./gpt2_finetuned_seinfeld_model", "./gpt2_finetuned_seinfeld_tokenizer")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


wow. (to George) You know, I think I'm gonna go see him tonight. (to Jerry) I'm gonna be in the audience. (to George) You know, I'm gonna be in the audience. I'm gonna be


In [27]:
image_path = '/content/drive/MyDrive/Colab Notebooks/script project/test_image_1.jpg'
