In [37]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import torch
import datasets
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
from bleu import list_bleu
import sacrebleu

In [41]:
file_path = 'lyrics/TaylorSwift.csv'
lyric = pd.read_csv(file_path)['Lyric'].dropna().reset_index(drop=True)

In [42]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token



In [43]:
size = lyric.shape[0]
print(size)
max_length = 1024
lyric = lyric.astype(str)
lyric.dtype
# lyric[140]

477


dtype('O')

In [44]:
for i in range(size):
    if len(lyric[i].split()) > 1024:
        print(i)
        lyric.pop(i)

224
264
348
403
407


In [45]:
lyric = lyric.reset_index(drop=True)
size = lyric.shape[0]
size

472

In [46]:
tokenizer.padding_side = "left"
tokenized_lyric = []
for i in range(size):
    text = lyric[i]
    tokenized_lyric.append(tokenizer(text, max_length=max_length, truncation=True, padding='max_length', return_tensors="pt"))
    tokenized_lyric[i]['labels'] = tokenized_lyric[i]['input_ids'].clone()

In [8]:
tokenized_lyric

[{'input_ids': tensor([[50256, 50256, 50256,  ...,   373,   534,  4004]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1]]), 'labels': tensor([[50256, 50256, 50256,  ...,   373,   534,  4004]])},
 {'input_ids': tensor([[50256, 50256, 50256,  ...,  1051, 29042, 29042]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1]]), 'labels': tensor([[50256, 50256, 50256,  ...,  1051, 29042, 29042]])},
 {'input_ids': tensor([[50256, 50256, 50256,  ...,   616,   616, 18854]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1]]), 'labels': tensor([[50256, 50256, 50256,  ...,   616,   616, 18854]])},
 {'input_ids': tensor([[50256, 50256, 50256,  ...,   530,   267,  1219]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1]]), 'labels': tensor([[50256, 50256, 50256,  ...,   530,   267,  1219]])},
 {'input_ids': tensor([[50256, 50256, 50256,  ...,   925,   502,   466]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1]]), 'labels': tensor([[50256, 50256, 50256,  ...,   925,   502,   466]])},


In [63]:
train_lyric = tokenized_lyric[:400]
test_lyric = tokenized_lyric[400:]

In [67]:
train_arguments = TrainingArguments(
    output_dir="./lyric_results",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=train_arguments,
    train_dataset=train_lyric,
    tokenizer=tokenizer,
)

trainer.train()

  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [37]:
for i in range(size):
    text = lyric[i]
    tokenized_lyric = tokenizer(text, return_tensors="pt")

ValueError: Input 0      vintage tee brand new phone high heels on cobb...
1      justin vernon i can see you standing honey wit...
2      we could leave the christmas lights up 'til ja...
3      i'm doing good i'm on some new shit been sayin...
4      i don't like your little games don't like your...
                             ...                        
474    drew looks at me i fake a smile so he won't se...
475    to put it plainly we just couldnt stop writing...
476    turn wycd on you're on your grunwald back from...
477    zwrotka  siedzę i patrzę jak czytasz z głową p...
478    trying just like they say just taking the step...
Name: Lyric, Length: 477, dtype: object is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

In [None]:
model.save_pretrained("./lyric_model")

In [13]:
model = GPT2LMHeadModel.from_pretrained("./lyric_model")

In [64]:
test_lyric[0]

{'input_ids': tensor([[50256, 50256, 50256,  ...,  1312,  6151,   326]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1]]), 'labels': tensor([[50256, 50256, 50256,  ...,  1312,  6151,   326]])}

In [65]:
min_length = 1024
to_delete = []
print(size)
for i in range(size-400):
    print(i)
    if np.sum(test_lyric[i]['attention_mask'].numpy()) < 100:
        # min_length = np.sum(test_lyric[i]['attention_mask'].numpy())
        # test_lyric.pop(i)
        print(i)
        to_delete.append(i)
        # size -= 1
print(to_delete)
for i in range(len(to_delete)):
    test_lyric.pop(to_delete[i]-i)
print(min_length)

472
0
1
2
3
4
5
6
7
8
9
9
10
10
11
12
13
14
15
16
17
17
18
19
20
20
21
22
22
23
24
25
25
26
27
28
29
30
31
32
32
33
34
35
36
37
38
39
40
40
41
42
43
44
45
46
47
48
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
69
70
71
[9, 10, 17, 20, 22, 25, 32, 40, 48, 69]
1024


In [66]:
print(len(test_lyric))

62


In [80]:
def process_test():
    for i in range(len(test_lyric)):
        test_lyric[i]['input_ids'][0] = torch.concat((torch.tensor(np.full((100,), 50256)), test_lyric[i]['input_ids'][0]))[:max_length]
        test_lyric[i]['attention_mask'][0] = torch.concat((torch.tensor(np.full((100,), 50256)), test_lyric[i]['attention_mask'][0]))[:max_length]

In [81]:
process_test()

In [82]:
test_lyric

[{'input_ids': tensor([[50256, 50256, 50256,  ...,   373, 17717,   259]]), 'attention_mask': tensor([[50256, 50256, 50256,  ...,     1,     1,     1]]), 'labels': tensor([[50256, 50256, 50256,  ...,  1312,  6151,   326]])},
 {'input_ids': tensor([[50256, 50256, 50256,  ...,  1183,   651,  1365]]), 'attention_mask': tensor([[50256, 50256, 50256,  ...,     1,     1,     1]]), 'labels': tensor([[50256, 50256, 50256,  ...,   345,   423,   284]])},
 {'input_ids': tensor([[50256, 50256, 50256,  ...,   345,   523,  1312]]), 'attention_mask': tensor([[50256, 50256, 50256,  ...,     1,     1,     1]]), 'labels': tensor([[50256, 50256, 50256,  ...,  1312,  2051,   345]])},
 {'input_ids': tensor([[50256, 50256, 50256,  ...,   345,   910,   262]]), 'attention_mask': tensor([[50256, 50256, 50256,  ...,     1,     1,     1]]), 'labels': tensor([[50256, 50256, 50256,  ...,   338,   616,   582]])},
 {'input_ids': tensor([[50256, 50256, 50256,  ...,   616,  1182,   705]]), 'attention_mask': tensor([[50

In [87]:
tokenizer.pad_token_id = tokenizer.eos_token_id
for i in range(len(test_lyric)):
    output = model.generate(input_ids=test_lyric[i]['input_ids'], attention_mask=test_lyric[i]['attention_mask'], max_new_tokens = 100, num_return_sequences=1)
    print(tokenizer.decode(output[0]))
    print(tokenizer.decode(test_lyric[i]['labels'][0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


IndexError: index out of range in self

In [9]:
lyric

0      vintage tee brand new phone high heels on cobb...
1      justin vernon i can see you standing honey wit...
2      we could leave the christmas lights up 'til ja...
3      i'm doing good i'm on some new shit been sayin...
4      i don't like your little games don't like your...
                             ...                        
467    drew looks at me i fake a smile so he won't se...
468    to put it plainly we just couldnt stop writing...
469    turn wycd on you're on your grunwald back from...
470    zwrotka  siedzę i patrzę jak czytasz z głową p...
471    trying just like they say just taking the step...
Name: Lyric, Length: 472, dtype: object

In [56]:
model = GPT2LMHeadModel.from_pretrained("./lyric_model")
gpt2 = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")



In [51]:
tot_size = len(lyric) - 1
print(tot_size)
test_size = 72
max_token = 100

471


In [60]:
score_all_model = []
score_all_gpt2 = []
for i in range(test_size):
    ref = [lyric[tot_size - i]]
    # tokenizer.pad_token_id = tokenizer.eos_token_id
    prompt = tokenizer(lyric[tot_size - i], max_length=max_token, truncation=True, return_tensors="pt")
    output_model = model.generate(input_ids=prompt['input_ids'], attention_mask=prompt['attention_mask'], max_new_tokens=max_token, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    output_gpt2 = gpt2.generate(input_ids=prompt['input_ids'], attention_mask=prompt['attention_mask'], max_new_tokens=max_token, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    score_model = sacrebleu.sentence_bleu(tokenizer.decode(output_model[0]), ref)
    score_gpt2 = sacrebleu.sentence_bleu(tokenizer.decode(output_gpt2[0]), ref)
    print(score_model.score, score_gpt2.score)
    score_all_model.append(score_model.score)
    score_all_gpt2.append(score_gpt2.score)

51.76483331990518 50.15214217450629
0.611404440755515 0.2927100245574598
42.2319276951886 44.480173519342586
49.48259877112261 49.1320506200221
16.789369510042775 12.495747658751178
20.48597392871792 15.462826578031848
9.61473169643006 8.805804131025496
43.05215082009055 23.3457734837468
19.97762272132689 18.999258213588007
19.510893947423178 19.49978970775825
27.200068662962394 26.750088613977116
9.488815450519796 9.705050805997313
10.886751171833199 10.4533552974009
43.15399680098916 47.63654824867901
18.81299726561037 18.76075279675465
13.791258457556518 15.923604479234807
14.16521688398309 9.69605324813007
5.44512532237439 6.729229114728109
18.30193038821825 18.12072763933766
8.748236723829718 16.899474521391213
27.09355641853726 24.12758632228586
19.68458587135496 15.281651207687158
8.9660520212405 14.30202095162956
27.844115155498905 33.746036000330015
10.213745038221145 11.647483659846074
15.298188972303544 13.25690617297067
14.189527432273586 15.601583726750809
16.0403061526510