In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

from sklearn.model_selection import train_test_split
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.cuda.set_device(device)

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('distilgpt2')
model.resize_token_embeddings(len(tokenizer))
model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [3]:
data_path = "netflix_titles.csv"
titles = pd.read_csv(data_path)['description']
titles.head()

0    As her father nears the end of his life, filmm...
1    After crossing paths at a party, a Cape Town t...
2    To protect his family from a powerful drug lor...
3    Feuds, flirtations and toilet talk go down amo...
4    In a city of coaching centers known to train I...
Name: description, dtype: object

In [4]:
max_length = max([len(tokenizer.encode(title.strip())) for title in titles])
print("max_length : ",max_length)

max_length :  62


In [6]:
train_titles, test_titles = train_test_split(titles, test_size=10)
print(len(train_titles))
print(train_titles[0])
print(len(test_titles))

8797
As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.
10


In [6]:
class NetflixDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>',
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [7]:
dataset = NetflixDataset(train_titles, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [8]:
"""
for temp in train_dataset:
    #print(temp[0])
    #print((temp[1]))
    
    #print(len(temp[0]))
    #print(len(temp[1]))
    if len(temp[0])!=len(temp[1]) and len(temp[0])==62:
        print(temp[0])
        print(temp[1])
    
        break

    """

'\nfor temp in train_dataset:\n    #print(temp[0])\n    #print((temp[1]))\n    \n    #print(len(temp[0]))\n    #print(len(temp[1]))\n    if len(temp[0])!=len(temp[1]) and len(temp[0])==62:\n        print(temp[0])\n        print(temp[1])\n    \n        break\n\n    '

In [9]:
training_args = TrainingArguments(output_dir='./results', num_train_epochs=1, logging_steps=100, save_steps=500,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='./logs', report_to = 'none')

In [10]:
next(model.parameters()).is_cuda

True

In [11]:
test_str = "Hello "

generated = tokenizer("<|startoftext|> "+ test_str, return_tensors="pt").input_ids
print(generated.cuda())
print("generated : ",len(generated))
sample_outputs = model.generate(generated.cuda(),no_repeat_ngram_size = 1,num_beams=20, num_return_sequences=2)

print(sample_outputs)
print(tokenizer.decode(sample_outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[50257, 18435,   220]], device='cuda:0')
generated :  1




tensor([[50257, 18435,   220,   170, 50258, 27332,   226,   229,   171,   225,
           223,   159,   227,   235,  8582,   242,   107, 12520,   234,   230],
        [50257, 18435,   220,   170, 50258, 27332,   226,   229,   171,   225,
           223,   159,   227,   235,  8582,   242,   107, 12520,   234,   231]],
       device='cuda:0')
 Hello � ㅍ🔯 🌈


In [12]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 7917
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 7917


  0%|          | 0/7917 [00:00<?, ?it/s]

{'loss': 5.3517, 'learning_rate': 4.943088402681169e-05, 'epoch': 0.01}
{'loss': 2.2332, 'learning_rate': 4.879853294549134e-05, 'epoch': 0.03}
{'loss': 2.1065, 'learning_rate': 4.8166181864170986e-05, 'epoch': 0.04}
{'loss': 2.0882, 'learning_rate': 4.753383078285064e-05, 'epoch': 0.05}


Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json


{'loss': 2.0953, 'learning_rate': 4.690147970153029e-05, 'epoch': 0.06}


Model weights saved in ./results\checkpoint-500\pytorch_model.bin


{'loss': 2.2692, 'learning_rate': 4.626912862020994e-05, 'epoch': 0.08}
{'loss': 2.123, 'learning_rate': 4.563677753888959e-05, 'epoch': 0.09}
{'loss': 2.0689, 'learning_rate': 4.500442645756924e-05, 'epoch': 0.1}
{'loss': 2.0445, 'learning_rate': 4.43720753762489e-05, 'epoch': 0.11}


Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json


{'loss': 2.1044, 'learning_rate': 4.373972429492855e-05, 'epoch': 0.13}


Model weights saved in ./results\checkpoint-1000\pytorch_model.bin


{'loss': 2.0321, 'learning_rate': 4.31073732136082e-05, 'epoch': 0.14}
{'loss': 2.0382, 'learning_rate': 4.247502213228785e-05, 'epoch': 0.15}
{'loss': 2.0575, 'learning_rate': 4.1842671050967496e-05, 'epoch': 0.16}
{'loss': 2.0174, 'learning_rate': 4.1210319969647146e-05, 'epoch': 0.18}


Saving model checkpoint to ./results\checkpoint-1500
Configuration saved in ./results\checkpoint-1500\config.json


{'loss': 1.9917, 'learning_rate': 4.05779688883268e-05, 'epoch': 0.19}


Model weights saved in ./results\checkpoint-1500\pytorch_model.bin


{'loss': 2.1346, 'learning_rate': 3.994561780700645e-05, 'epoch': 0.2}
{'loss': 2.0206, 'learning_rate': 3.93132667256861e-05, 'epoch': 0.21}
{'loss': 2.0005, 'learning_rate': 3.868091564436576e-05, 'epoch': 0.23}
{'loss': 2.0055, 'learning_rate': 3.80485645630454e-05, 'epoch': 0.24}


Saving model checkpoint to ./results\checkpoint-2000
Configuration saved in ./results\checkpoint-2000\config.json


{'loss': 2.0618, 'learning_rate': 3.741621348172506e-05, 'epoch': 0.25}


Model weights saved in ./results\checkpoint-2000\pytorch_model.bin


{'loss': 2.046, 'learning_rate': 3.678386240040471e-05, 'epoch': 0.27}
{'loss': 1.9627, 'learning_rate': 3.6151511319084356e-05, 'epoch': 0.28}
{'loss': 1.9782, 'learning_rate': 3.551916023776401e-05, 'epoch': 0.29}
{'loss': 1.999, 'learning_rate': 3.488680915644366e-05, 'epoch': 0.3}


Saving model checkpoint to ./results\checkpoint-2500
Configuration saved in ./results\checkpoint-2500\config.json


{'loss': 2.0476, 'learning_rate': 3.4254458075123305e-05, 'epoch': 0.32}


Model weights saved in ./results\checkpoint-2500\pytorch_model.bin


{'loss': 2.0319, 'learning_rate': 3.362210699380296e-05, 'epoch': 0.33}
{'loss': 1.9654, 'learning_rate': 3.298975591248261e-05, 'epoch': 0.34}
{'loss': 2.0428, 'learning_rate': 3.235740483116226e-05, 'epoch': 0.35}
{'loss': 1.977, 'learning_rate': 3.172505374984192e-05, 'epoch': 0.37}


Saving model checkpoint to ./results\checkpoint-3000
Configuration saved in ./results\checkpoint-3000\config.json


{'loss': 1.9788, 'learning_rate': 3.109270266852157e-05, 'epoch': 0.38}


Model weights saved in ./results\checkpoint-3000\pytorch_model.bin


{'loss': 2.0198, 'learning_rate': 3.0460351587201213e-05, 'epoch': 0.39}
{'loss': 1.9844, 'learning_rate': 2.9828000505880866e-05, 'epoch': 0.4}
{'loss': 1.9495, 'learning_rate': 2.9195649424560516e-05, 'epoch': 0.42}
{'loss': 2.0786, 'learning_rate': 2.856329834324017e-05, 'epoch': 0.43}


Saving model checkpoint to ./results\checkpoint-3500
Configuration saved in ./results\checkpoint-3500\config.json


{'loss': 2.0732, 'learning_rate': 2.7930947261919822e-05, 'epoch': 0.44}


Model weights saved in ./results\checkpoint-3500\pytorch_model.bin


{'loss': 1.9365, 'learning_rate': 2.729859618059947e-05, 'epoch': 0.45}
{'loss': 2.0852, 'learning_rate': 2.6666245099279118e-05, 'epoch': 0.47}
{'loss': 1.9668, 'learning_rate': 2.603389401795877e-05, 'epoch': 0.48}
{'loss': 2.0481, 'learning_rate': 2.5401542936638424e-05, 'epoch': 0.49}


Saving model checkpoint to ./results\checkpoint-4000
Configuration saved in ./results\checkpoint-4000\config.json


{'loss': 1.9954, 'learning_rate': 2.4769191855318073e-05, 'epoch': 0.51}


Model weights saved in ./results\checkpoint-4000\pytorch_model.bin


{'loss': 2.0307, 'learning_rate': 2.4136840773997723e-05, 'epoch': 0.52}
{'loss': 1.9881, 'learning_rate': 2.3504489692677376e-05, 'epoch': 0.53}
{'loss': 1.9509, 'learning_rate': 2.287213861135703e-05, 'epoch': 0.54}
{'loss': 1.9076, 'learning_rate': 2.2239787530036675e-05, 'epoch': 0.56}


Saving model checkpoint to ./results\checkpoint-4500
Configuration saved in ./results\checkpoint-4500\config.json


{'loss': 1.9406, 'learning_rate': 2.1607436448716328e-05, 'epoch': 0.57}


Model weights saved in ./results\checkpoint-4500\pytorch_model.bin


{'loss': 1.9384, 'learning_rate': 2.097508536739598e-05, 'epoch': 0.58}
{'loss': 1.9487, 'learning_rate': 2.034273428607563e-05, 'epoch': 0.59}
{'loss': 1.9556, 'learning_rate': 1.971038320475528e-05, 'epoch': 0.61}
{'loss': 1.9785, 'learning_rate': 1.9078032123434934e-05, 'epoch': 0.62}


Saving model checkpoint to ./results\checkpoint-5000
Configuration saved in ./results\checkpoint-5000\config.json


{'loss': 2.0206, 'learning_rate': 1.8445681042114583e-05, 'epoch': 0.63}


Model weights saved in ./results\checkpoint-5000\pytorch_model.bin


{'loss': 1.9662, 'learning_rate': 1.7813329960794233e-05, 'epoch': 0.64}
{'loss': 1.9842, 'learning_rate': 1.7180978879473886e-05, 'epoch': 0.66}
{'loss': 2.0086, 'learning_rate': 1.6548627798153535e-05, 'epoch': 0.67}
{'loss': 1.9608, 'learning_rate': 1.5916276716833185e-05, 'epoch': 0.68}


Saving model checkpoint to ./results\checkpoint-5500
Configuration saved in ./results\checkpoint-5500\config.json


{'loss': 2.0461, 'learning_rate': 1.5283925635512838e-05, 'epoch': 0.69}


Model weights saved in ./results\checkpoint-5500\pytorch_model.bin


{'loss': 1.965, 'learning_rate': 1.4651574554192488e-05, 'epoch': 0.71}
{'loss': 2.0018, 'learning_rate': 1.4019223472872139e-05, 'epoch': 0.72}
{'loss': 1.9851, 'learning_rate': 1.3386872391551792e-05, 'epoch': 0.73}
{'loss': 1.9579, 'learning_rate': 1.275452131023144e-05, 'epoch': 0.75}


Saving model checkpoint to ./results\checkpoint-6000
Configuration saved in ./results\checkpoint-6000\config.json


{'loss': 1.9575, 'learning_rate': 1.2122170228911093e-05, 'epoch': 0.76}


Model weights saved in ./results\checkpoint-6000\pytorch_model.bin


{'loss': 1.9087, 'learning_rate': 1.1489819147590743e-05, 'epoch': 0.77}
{'loss': 2.0082, 'learning_rate': 1.0857468066270394e-05, 'epoch': 0.78}
{'loss': 2.0013, 'learning_rate': 1.0225116984950045e-05, 'epoch': 0.8}
{'loss': 2.008, 'learning_rate': 9.592765903629695e-06, 'epoch': 0.81}


Saving model checkpoint to ./results\checkpoint-6500
Configuration saved in ./results\checkpoint-6500\config.json


{'loss': 1.9353, 'learning_rate': 8.960414822309346e-06, 'epoch': 0.82}


Model weights saved in ./results\checkpoint-6500\pytorch_model.bin


{'loss': 1.9819, 'learning_rate': 8.328063740988997e-06, 'epoch': 0.83}
{'loss': 1.9105, 'learning_rate': 7.695712659668649e-06, 'epoch': 0.85}
{'loss': 1.9406, 'learning_rate': 7.0633615783482984e-06, 'epoch': 0.86}
{'loss': 1.8958, 'learning_rate': 6.431010497027951e-06, 'epoch': 0.87}


Saving model checkpoint to ./results\checkpoint-7000
Configuration saved in ./results\checkpoint-7000\config.json


{'loss': 1.9907, 'learning_rate': 5.798659415707601e-06, 'epoch': 0.88}


Model weights saved in ./results\checkpoint-7000\pytorch_model.bin


{'loss': 1.9228, 'learning_rate': 5.166308334387252e-06, 'epoch': 0.9}
{'loss': 1.8919, 'learning_rate': 4.533957253066903e-06, 'epoch': 0.91}
{'loss': 1.9164, 'learning_rate': 3.901606171746554e-06, 'epoch': 0.92}
{'loss': 1.992, 'learning_rate': 3.269255090426205e-06, 'epoch': 0.93}


Saving model checkpoint to ./results\checkpoint-7500
Configuration saved in ./results\checkpoint-7500\config.json


{'loss': 1.9206, 'learning_rate': 2.6369040091058556e-06, 'epoch': 0.95}


Model weights saved in ./results\checkpoint-7500\pytorch_model.bin


{'loss': 1.9883, 'learning_rate': 2.0045529277855064e-06, 'epoch': 0.96}
{'loss': 1.9983, 'learning_rate': 1.3722018464651576e-06, 'epoch': 0.97}
{'loss': 1.9633, 'learning_rate': 7.398507651448085e-07, 'epoch': 0.99}
{'loss': 1.9329, 'learning_rate': 1.0749968382445935e-07, 'epoch': 1.0}




Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 464.7978, 'train_samples_per_second': 17.033, 'train_steps_per_second': 17.033, 'train_loss': 2.0455152909834307, 'epoch': 1.0}


TrainOutput(global_step=7917, training_loss=2.0455152909834307, metrics={'train_runtime': 464.7978, 'train_samples_per_second': 17.033, 'train_steps_per_second': 17.033, 'train_loss': 2.0455152909834307, 'epoch': 1.0})

In [15]:
results = []
for title in test_titles: 
    new_titles = {
        'seed': title.split()[0],
        'predictions': []
    }
    generated = tokenizer("<|startoftext|> "+ title.split()[0], return_tensors="pt").input_ids.cuda()
    sample_outputs = model.generate(generated,no_repeat_ngram_size = 1,num_beams=20, num_return_sequences=2)
    
    new_titles['predictions'] = sample_outputs
    results.append(new_titles)
    

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

In [14]:
for new_title in results:
    print(f"seed: {new_title['seed']}")
    for i, pred in enumerate(new_title['predictions']):
        print(f"{i+1}: {tokenizer.decode(pred, skip_special_tokens=True)}")

seed: When
1:  When a young man falls in love with his girlfriend, he begins to suspect that she’
2:  When a young man falls in love with his older sister, he's forced to choose between her
seed: Ahead
1:  Ahead of his first birthday, a young man embarks on an epic journey to find the truth
2:  Ahead of his first birthday, a young man embarks on an epic journey to save the world
seed: A
1:  A young man's life is turned upside down when he falls in love with his best friend,
2:  A young man's life is turned upside down when he falls in love with his ex-wife
seed: This
1:  This docuseries takes a look at the rise and fall of Donald Trump's presidential campaign,
2:  This docuseries takes a look at the rise and fall of Donald Trump’s presidential
seed: An
1:  An ex-con returns to the U.S., where he reconnects with his estranged father
2:  An ex-con returns to the U.S., where he meets his estranged wife, who
seed: An
1:  An ex-con returns to his hometown of New York, where he’s haunted by
