In [19]:
import numpy as np
import pandas as pd
import string
import nltk
import re
from sklearn.model_selection import train_test_split

# load lyrics data from MusicOSet 
# should show a dataframe with 20000 song_ids and their lyrics
df = pd.read_csv("musicoset_songfeatures/lyrics.csv", sep="\t")
df.info()
df.head()

# adding poems from The Poetry Foundation (14000 poems, author and tags assoiated with poem)
# we combined both sources to increase data quali5y
pdf = pd.read_csv('musicoset_songfeatures/PoetryFoundationData.csv',quotechar='"')
pdf.head()
df = df.dropna()

# initializing string stranslator to clean punctutation before training
translator = str.maketrans('', '', string.punctuation)

# splits lyrics into intro, verses, and chorus, only selects first 4 verses + chorus
def split_text(x):
   text = x['lyrics']
   sections = text.split('\\n\\n')
   keys = {'Verse 1': np.nan,'Verse 2':np.nan,'Verse 3':np.nan,'Verse 4':np.nan, 'Chorus':np.nan}
   lyrics = str()
   single_text = []
   res = {}
   for s in sections:
       key = s[s.find('[') + 1:s.find(']')].strip()
       if ':' in key:
           key = key[:key.find(':')]
          
       if key in keys:
           single_text += [x.lower().replace('(','').replace(')','').translate(translator) for x in s[s.find(']')+1:].split('\\n') if len(x) > 1]
       res['single_text'] =  ' \n '.join(single_text)
   return pd.Series(res)
# joins resulting text into a single text
df = df.join(df.apply(split_text, axis=1))
df.head()

print(df)

# # Testing
# lines = ''
# for i in df.head(1)['lyrics']:
#     lines = i.split('\\n\\n')
# print (lines)
# df['lyrics'].iloc[0]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20404 entries, 0 to 20403
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   song_id  20404 non-null  object
 1   lyrics   19663 non-null  object
dtypes: object(2)
memory usage: 318.9+ KB
                      song_id  \
0      3e9HZxeyfWwjeyPAMmWSSQ   
1      5p7ujcrUXASCNwRaWNHR1C   
2      2xLMifQCjDGFmkHkpNLD9h   
4      1rqqCSm0Qe4I9rUvWncaom   
5      0bYg9bo50gSsH3LtXe2SQn   
...                       ...   
20399  2pMAmZdHfQHyqJCXJbfhK3   
20400  0IaMMHVbpJ0LrRAeigWOXr   
20401  4nASzyRbzL5qZQuOPjQfsj   
20402  2F4FNcz68howQWD4zaGJSi   
20403  0TEQ2QmFXnHCgQvYuvsbp2   

                                                  lyrics  \
0      ['[Verse 1]\nThought I\'d end up with Sean\nBu...   
1      ["[Verse 1]\nFound you when your heart was bro...   
2      ['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun...   
4      ["[Intro]\nHigh, high hopes\n\n[Chorus]\nHad t...   
5    

In [20]:
# cleaning poems text
pdf['single_text'] = pdf['Poem'].apply(lambda x: ' \n '.join([l.lower().strip().translate(translator) for l in x.splitlines() if len(l)>0]))
pdf.head()

# combine poems dataframe and lyrics dataframe
sum_df = pd.DataFrame( df['single_text'] )
sum_df = pd.concat([df, pd.DataFrame( pdf['single_text'])])
sum_df.dropna(inplace=True)

In [21]:
text_as_list = []
frequencies = {}
uncommon_words = set()
MIN_FREQUENCY = 7
MIN_SEQ = 5
BATCH_SIZE =  32

def extract_text(text):
   global text_as_list
   text_as_list += [w for w in text.split(' ') if w.strip() != '' or w == '\n']
   
df['single_text'].apply(extract_text)
print('Total words: ', len(text_as_list))
for w in text_as_list:
   frequencies[w] = frequencies.get(w, 0) + 1
  
uncommon_words = set([key for key in frequencies.keys() if frequencies[key] < MIN_FREQUENCY])
words = sorted(set([key for key in frequencies.keys() if frequencies[key] >= MIN_FREQUENCY]))
num_words = len(words)
word_indices = dict((w, i) for i, w in enumerate(words))
indices_word = dict((i, w) for i, w in enumerate(words))
print('Words with less than {} appearances: {}'.format( MIN_FREQUENCY, len(uncommon_words)))
print('Words with more than {} appearances: {}'.format( MIN_FREQUENCY, len(words)))
valid_seqs = []
end_seq_words = []
for i in range(len(text_as_list) - MIN_SEQ ):
   end_slice = i + MIN_SEQ + 1
   if len( set(text_as_list[i:end_slice]).intersection(uncommon_words) ) == 0:
       valid_seqs.append(text_as_list[i: i + MIN_SEQ])
       end_seq_words.append(text_as_list[i + MIN_SEQ])
      
print('Valid sequences of size {}: {}'.format(MIN_SEQ, len(valid_seqs)))
X_train, X_test, y_train, y_test = train_test_split(valid_seqs, end_seq_words, test_size=0.15, random_state=42)

Total words:  2339637
Words with less than 7 appearances: 26272
Words with more than 7 appearances: 9163
Valid sequences of size 5: 2087702


In [22]:
# load in pretrained tokenizer and model. Using GPT2
from transformers import GPT2Tokenizer, GPT2Model

tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")

model = GPT2Model.from_pretrained("openai-community/gpt2")



In [28]:
with open('train.txt', 'w') as f:
  for t in X_train:
    t = ' '.join(t)
    f.write(t)
    f.write(' ')


with open('test.txt', 'w') as f:
  for t in X_test:
    t = ' '.join(t)
    f.write(t)
    f.write(' ')

train_path = 'train.txt'
test_path = 'test.txt'

'flow like dead people i'

In [29]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
           tokenizer=tokenizer,
           file_path=train_path,
           block_size=128)

    test_dataset = TextDataset(
           tokenizer=tokenizer,
           file_path=test_path,
           block_size=128)

    data_collator = DataCollatorForLanguageModeling(
         tokenizer=tokenizer, mlm=False,
     )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)



In [30]:
from transformers import TrainingArguments, Trainer

In [31]:
training_args = TrainingArguments(
    output_dir="./gpt-2", 
    overwrite_output_dir=True,
    num_train_epochs=300, 
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    eval_steps = 100, 
    save_steps=800, 
    warmup_steps=500
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [32]:
trainer.train()

ValueError: num_samples should be a positive integer value, but got num_samples=0

GPT2Tokenizer(name_or_path='openai-community/gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [33]:
train_dataset

<transformers.data.datasets.language_modeling.TextDataset at 0x26d9f9f1100>