In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.1-py3-none-any.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.2 tokenizers-0.13.2 transformers-4.27.1


In [None]:
import gc

import pandas as pd
import torch
from huggingface_hub import login
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

In [None]:
# Print the NVIDIA driver / GPU status table to confirm which GPU the runtime has.
!nvidia-smi

Fri Mar 17 17:11:12 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P0    26W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Seed PyTorch's global random number generator; the cell displays the
# Generator object that manual_seed returns.
SEED = 42
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fdb49e042f0>

### Loading GPT2-Medium Model from 🤗 Model Hub 

In [None]:


# Tokenizer for 'gpt2-medium' with explicit begin/end/padding special tokens.
special_tokens = dict(bos_token='<|startoftext|>',
                      eos_token='<|endoftext|>',
                      pad_token='<|pad|>')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', **special_tokens)

# Pretrained language-model weights, moved to the GPU.
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model = model.cuda()

# The tokenizer vocabulary now includes the added special tokens, so the
# embedding matrix is resized to len(tokenizer); the new Embedding is displayed.
model.resize_token_embeddings(len(tokenizer))


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50259, 1024)

In [None]:
# Read the Netflix titles CSV and keep only its 'description' column (a Series).
netflix_titles = pd.read_csv('netflix_titles.csv')
descriptions = netflix_titles['description']

In [None]:
# Longest tokenized training example, measured on the text exactly as the
# dataset builds it: wrapped in '<|startoftext|>' ... '<|endoftext|>'.
# Measuring the bare description instead would make max_length two tokens too
# short, so truncation=True would cut the end-of-text token off the longest
# descriptions.
max_length = max(
    len(tokenizer.encode('<|startoftext|>' + description + '<|endoftext|>'))
    for description in descriptions
)

In [None]:
class NetflixDataset(Dataset):
    """Pre-tokenized text dataset.

    Every input text is wrapped in '<|startoftext|>' / '<|endoftext|>',
    tokenized with truncation and padding to ``max_length``, and stored as a
    pair of tensors (token ids, attention mask).

    Args:
        txt_list: iterable of strings to encode.
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=True, max_length=..., padding="max_length")``
            that returns a mapping with 'input_ids' and 'attention_mask'.
        max_length: length each encoded example is truncated / padded to.
    """

    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []  # created for callers but never filled by this class

        for txt in txt_list:
            wrapped_text = '<|startoftext|>' + txt + '<|endoftext|>'
            encoded = tokenizer(
                wrapped_text,
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            ids_tensor = torch.tensor(encoded['input_ids'])
            mask_tensor = torch.tensor(encoded['attention_mask'])
            self.input_ids.append(ids_tensor)
            self.attn_masks.append(mask_tensor)

    def __len__(self):
        """Number of encoded examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensors of example ``idx``."""
        ids_tensor = self.input_ids[idx]
        mask_tensor = self.attn_masks[idx]
        return ids_tensor, mask_tensor

In [None]:
# Encode every description, then split the examples 90% / 10% at random into
# training and validation subsets.
dataset = NetflixDataset(descriptions, tokenizer, max_length=max_length)
num_examples = len(dataset)
train_size = int(0.9 * num_examples)
val_size = num_examples - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


In [None]:
# Run a full garbage-collection pass before training; the cell displays the
# integer returned by gc.collect(). `gc` is imported in the notebook's import
# cell rather than here, so all dependencies are declared in one place.
gc.collect()

0

In [None]:
# Release unused cached GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()

In [None]:
# Hyper-parameters for the fine-tuning run, one argument per line.
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    report_to='none',
    # schedule: a single epoch, loss logged every 100 steps
    num_train_epochs=1,
    logging_steps=100,
    save_steps=5000,
    # one example per device per step for both training and evaluation
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # optimisation
    warmup_steps=10,
    weight_decay=0.05,
)


In [None]:
def collate_for_causal_lm(batch):
    """Stack dataset items into a model-ready batch.

    Args:
        batch: list of ``(input_ids, attention_mask)`` tensor pairs as returned
            by the dataset's ``__getitem__``.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels'. Labels are a copy
        of the input ids in which every padded position (attention mask == 0)
        is replaced by -100, so padding does not contribute to the loss.
    """
    input_ids = torch.stack([sample[0] for sample in batch])
    attention_mask = torch.stack([sample[1] for sample in batch])
    labels = input_ids.clone()
    # -100 is the label value the language-modeling loss ignores.
    labels[attention_mask == 0] = -100
    return {'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels}


# Keep a handle on the trainer so it can be reused after training.
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=train_dataset,
                  eval_dataset=val_dataset,
                  data_collator=collate_for_causal_lm)
trainer.train()



Step,Training Loss
100,5.5004
200,2.0131
300,1.8788
400,1.8647
500,1.9156
600,1.8732
700,1.8781
800,1.8983
900,1.8288
1000,1.8262


TrainOutput(global_step=7926, training_loss=1.8257081573008889, metrics={'train_runtime': 1929.9988, 'train_samples_per_second': 4.107, 'train_steps_per_second': 4.107, 'total_flos': 891356768944128.0, 'train_loss': 1.8257081573008889, 'epoch': 1.0})

### GPT-Generated Descriptions

In [None]:
# Encode the generation prompt (start token followed by a space) as a PyTorch
# tensor of token ids and move it to the GPU.
prompt_encoding = tokenizer("<|startoftext|> ", return_tensors="pt")
generated = prompt_encoding.input_ids.cuda()

In [None]:
# Sample 20 candidate descriptions from the fine-tuned model.
# The prompt contains no padding, so its attention mask is all ones. Passing
# the mask and the tokenizer's pad token id explicitly avoids generate()
# falling back to the end-of-text token as padding, which is what the
# warning printed for this cell reports.
sample_outputs = model.generate(
    generated,
    attention_mask=torch.ones_like(generated),
    pad_token_id=tokenizer.pad_token_id,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=1.9,
    max_length=300,
    num_return_sequences=20,
)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# Decode each sampled sequence (special tokens removed) and print it with its index.
for index, token_ids in enumerate(sample_outputs):
    text = tokenizer.decode(token_ids, skip_special_tokens=True)
    print(f"{index}: {text}")

0:  丁This original comedy series, made as simple to live by "Buffy," shows a gang of kids as the ultimate group of ninja assassins.
1:  urnged the fate of his friend by plunging in a high, black and empty-brained comedy routine. Now it’s over on all manner of occasions.
2:   : Comic strip legend Michael De Luca performs these outrageous ideas — while giving one heckled crowd a funny and embarrassing twist on her life.
3:  ‎Frenemies from another woman‌s past come to a virtual encounter to seek out each of their victims, which results in tragic mayhem in the flesh. Based of popular culture and true events.
4:  ㅋㅋ How else can a young-looking rapper marry the mother that's given everything by four straight guys? He learns they were all his friends when age mattered most? Well,After, too, upon learning the sonars of a royal letters about 15 of the or if the exact as to move. Group II: of this letter is in NoëdextentC child schedules information available dates, ifakth Division, alonger ra

### Original Descriptions (Random Sample)

In [None]:
# Widen pandas' column display so descriptions are shown untruncated, then
# display ten randomly chosen original descriptions for comparison.
pd.set_option('display.max_colwidth', 1000)
descriptions.sample(n=10)

4970                       Three buddies with big dreams go from underachieving slackers to badass warriors when their posh hotel is taken over by terrorists.
3362         In his first stand-up special, Arsenio Hall discusses getting older, the changing times and culture, social issues and even bothersome baby toes.
5494                                                  Music meets imagination in this inventive animated series about thinking outside the box and having fun.
1688                        Explore an array of unique competitions, from the quirky to the bizarre, and meet their passionate communities in this docuseries.
1349         From his days as a petty thief to becoming head of a drug-trafficking empire, this riveting series charts the life of the infamous Pablo Escobar.
4862        This anime adventure follows the battle between a saint of Athena and an avatar of Hades who's working on a painting that could destroy the world.
2676     A top Israeli agent comes out of reti

In [None]:
# Authenticate with the Hugging Face Hub; the token is requested at run time
# rather than being stored in the notebook. `login` is imported in the
# notebook's import cell so all dependencies are declared in one place.
login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# The tokenizer was extended with extra special tokens and the model's
# embedding matrix was resized to match, so the model is only usable together
# with this tokenizer: upload both to the same repository. The model is pushed
# last so the cell still displays the model upload's CommitInfo.
tokenizer.push_to_hub("harouzie/gpt-netflix")
model.push_to_hub("harouzie/gpt-netflix")

pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/harouzie/gpt-netflix/commit/5e5e7d31a4f3a29f90ce05d60ab641d45b29d5b0', commit_message='Upload model', commit_description='', oid='5e5e7d31a4f3a29f90ce05d60ab641d45b29d5b0', pr_url=None, pr_revision=None, pr_num=None)