In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import random
import math

import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
from transformers import pipeline
from transformers import GPT2Tokenizer, GPT2Model


from datasets import load_dataset
from datasets import ClassLabel
from datasets import Dataset


from IPython.display import display, HTML

2024-05-06 14:17:07.025289: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Main lyrics corpus; the following cells clean and train on its 'Lyrics' column.
df = pd.read_csv("example_dataset/lyrics_dataset.csv")

# Separate 2Pac lyrics file, loaded without an index column and with the
# 'Unnamed: 0' column dropped.
# NOTE(review): `pac` is not referenced again in the visible cells.
pac = (
    pd.read_csv("lyric_files/2pac.csv", index_col=False)
    .drop(columns='Unnamed: 0')
)

In [3]:
def unicodetoascii(text):
    """Map mis-decoded punctuation to ASCII and normalise line breaks.

    Each punctuation pattern below is the UTF-8 byte sequence of a
    typographic character written as individual code points, so it only
    matches text in which those bytes were decoded one per character
    (presumably Latin-1 mojibake -- confirm against the dataset). A
    correctly decoded character such as U+2019 is left untouched.

    After the punctuation pass, every newline is padded with a space on
    both sides, runs of three padded newlines are collapsed to two,
    carriage returns are deleted, and finally leading/trailing '.' and
    ' ' characters, then leading/trailing newlines and spaces, are
    stripped.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Applied strictly in this order; each entry is (pattern, replacement).
    substitutions = (
        # apostrophe, accented e, hyphens and dashes
        ('\xe2\x80\x99', "'"),
        ('\xc3\xa9', 'e'),
        ('\xe2\x80\x90', '-'),
        ('\xe2\x80\x91', '-'),
        ('\xe2\x80\x92', '-'),
        ('\xe2\x80\x93', '-'),
        ('\xe2\x80\x94', '-'),
        # single and double quotation marks
        ('\xe2\x80\x98', "'"),
        ('\xe2\x80\x9b', "'"),
        ('\xe2\x80\x9c', '"'),
        ('\xe2\x80\x9d', '"'),
        ('\xe2\x80\x9e', '"'),
        ('\xe2\x80\x9f', '"'),
        # ellipsis and prime marks
        ('\xe2\x80\xa6', '...'),
        ('\xe2\x80\xb2', "'"),
        ('\xe2\x80\xb3', "'"),
        ('\xe2\x80\xb4', "'"),
        ('\xe2\x80\xb5', "'"),
        ('\xe2\x80\xb6', "'"),
        ('\xe2\x80\xb7', "'"),
        # superscript operators and parentheses
        ('\xe2\x81\xba', "+"),
        ('\xe2\x81\xbb', "-"),
        ('\xe2\x81\xbc', "="),
        ('\xe2\x81\xbd', "("),
        ('\xe2\x81\xbe', ")"),
        # line-break normalisation
        ('\n', " \n "),
        ('\n \n \n ', " \n \n "),
        ('\n  \n  \n ', " \n \n "),
        ('\r', ""),
    )

    ascii_code = text
    for pattern, replacement in substitutions:
        ascii_code = ascii_code.replace(pattern, replacement)

    # Two separate strips: the second only removes newlines/spaces that the
    # first one exposed.
    ascii_code = ascii_code.strip('. ')
    return ascii_code.strip('\n ')

In [4]:
# Clean every lyric. unicodetoascii calls str methods on its argument, so a
# non-string value in this column would raise here.
df["Lyrics"] = df["Lyrics"].apply(unicodetoascii)

In [5]:
# define a function that cleans consecutive duplicated phrases and removes ad-libs from a list of strings

def remove_dups_ad_libs(l):
    """Blank out ad-libs and near-consecutive repeats in a list of phrases.

    A phrase is blanked (set to "") when, ignoring surrounding spaces, it
    is wrapped in parentheses, or when it equals one of the next two
    phrases, so the last occurrence of a repeat is the one kept.

    The list `l` is modified in place. The returned list is a copy of it
    without its final element.

    NOTE(review): the repeat check is skipped for the last two positions,
    and the final phrase is always dropped -- confirm both are intended.

    Args:
        l: list of phrase strings.

    Returns:
        A new list holding all but the last element of the modified `l`.
    """
    total = len(l)
    for idx in range(total):
        current = l[idx].strip(' ')

        # Ad-lib: a non-empty phrase wrapped in parentheses.
        if current and current[0] == '(' and current[-1] == ')':
            l[idx] = ""

        # The look-ahead needs two following phrases.
        if idx >= total - 2:
            continue

        following = l[idx + 1].strip(' ')
        if current == following or current == l[idx + 2].strip(' '):
            l[idx] = ""

    return l[:-1]

In [6]:
# Apply remove_dups_ad_libs to every song and write the cleaned text back.
# NOTE(review): column position 2 is presumably the 'Lyrics' column --
# confirm against the CSV header.
for row in range(df.shape[0]):
    lyrics = df.iloc[row, 2]
    phrases = lyrics.split('\n')
    clean_l = remove_dups_ad_libs(phrases)
    # Drop the entries the cleaner blanked out. Only exactly-empty strings
    # are removed; whitespace-only phrases are kept. A single filtering pass
    # avoids repeated list scans and needs no exception handling.
    clean_l = [phrase for phrase in clean_l if phrase != '']
    clean_lyrics = '\n'.join(clean_l)
    df.iloc[row, 2] = clean_lyrics

In [9]:
# Report the row count before and after discarding rows with missing lyrics.
rows_before = len(df)
df = df.dropna(subset=["Lyrics"])
print(rows_before)
print(len(df))

3614
3614


In [10]:
# Hold out 5% of the rows as the validation set, using a fixed random_state.
train, test = train_test_split(df, random_state=42, test_size=0.05)

In [11]:
# Only the lyric text is exported: each split becomes a single-column CSV
# written without the index.
df_train_Lyrics = train['Lyrics']
df_val_Lyrics = test['Lyrics']

df_train_Lyrics.to_csv('Train_rap_bot.csv', index=False)
df_val_Lyrics.to_csv('Val_rap_bot.csv', index=False)

In [12]:
# Reload the two exported CSVs as "train" and "validation" splits.
path_to_train = './Train_rap_bot.csv'
path_to_validation = './Val_rap_bot.csv'
datasets = load_dataset(
    "csv",
    data_files={
        "train": path_to_train,
        "validation": path_to_validation,
    },
)


Downloading and preparing dataset csv/default to /Users/lucasderr/.cache/huggingface/datasets/csv/default-ba9de918dfc59e02/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /Users/lucasderr/.cache/huggingface/datasets/csv/default-ba9de918dfc59e02/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# Load the distilgpt2 tokenizer and reuse its end-of-sequence token as the
# padding token.
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

In [14]:
def tokenization(data):
    """Tokenize data["Lyrics"] with the module-level `tokenizer`.

    The call asks for padding="max_length", truncation=True and
    max_length=1024.

    NOTE(review): this helper is not called in the visible cells.

    Args:
        data: mapping with a "Lyrics" entry to tokenize.

    Returns:
        Whatever the tokenizer call returns for that input.
    """
    return tokenizer(
        data["Lyrics"],
        max_length=1024,
        truncation=True,
        padding="max_length",
    )

In [32]:
def _tokenize_split(split):
    """Tokenize the "Lyrics" field of every record in one dataset split.

    Records whose "Lyrics" value is not a string (for example an empty
    lyric that was read back from CSV as a missing value) cannot be
    tokenized; they are skipped and counted, not hidden behind a
    catch-all exception handler.

    Args:
        split: iterable of records, each exposing a "Lyrics" entry.

    Returns:
        A Dataset built from the tokenizer output of every kept record.
    """
    encoded = []
    skipped = 0
    for record in split:
        text = record["Lyrics"]
        if not isinstance(text, str):
            skipped += 1
            continue
        encoded.append(tokenizer(text))
    if skipped:
        print(f"Skipped {skipped} record(s) without text lyrics")
    return Dataset.from_pandas(pd.DataFrame(encoded))


# No padding or truncation here: sequences are re-chunked by group_texts later.
train_tokens = _tokenize_split(datasets['train'])
test_tokens = _tokenize_split(datasets['validation'])

In [36]:
# Length, in tokens, of each chunk that group_texts cuts the concatenated
# token streams into.
block_size = 256

In [37]:
def group_texts(examples, chunk_size=None):
    """Concatenate a batch of tokenized texts and re-split it into fixed-size chunks.

    Every column of `examples` (a mapping from column name to a list of
    token lists) is flattened into one long list and cut into consecutive
    chunks of `chunk_size` items. Items that do not fill a final chunk are
    dropped. A "labels" column is added as a copy of the chunked
    "input_ids" list.

    Args:
        examples: batch mapping; it must contain "input_ids", and all
            columns are expected to have the same flattened length, since
            the length of the first column decides where every column is cut.
        chunk_size: chunk length; when None, the module-level `block_size`
            is used.

    Returns:
        dict with the same columns as `examples`, chunked, plus "labels".
    """
    from itertools import chain  # local import keeps this cell self-contained

    if chunk_size is None:
        chunk_size = block_size

    # chain.from_iterable flattens in linear time; sum(lists, []) would copy
    # the growing list on every addition.
    concatenated_examples = {
        key: list(chain.from_iterable(examples[key])) for key in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a whole number of chunks, dropping the small remainder.
    total_length = (total_length // chunk_size) * chunk_size
    result = {
        key: [
            tokens[start:start + chunk_size]
            for start in range(0, total_length, chunk_size)
        ]
        for key, tokens in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [38]:
# Re-chunk both tokenized splits with group_texts, 8 rows per call.
# group_texts drops the remainder of every batch it receives.
lm_dataset_train = train_tokens.map(
    group_texts,
    batched=True,
    batch_size=8,
)
lm_dataset_test = test_tokens.map(
    group_texts,
    batched=True,
    batch_size=8,
)

Map:   0%|          | 0/3429 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

In [39]:
# Display the chunked training set (its features and row count).
lm_dataset_train

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 12406
})

In [41]:
# Pretrained checkpoint to fine-tune, loaded with a causal language-modeling head.
model_name = "distilgpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)

In [42]:
# Keep only the part after the last "/" so the output directory is named
# after the bare model id.
model_name_new = model_name.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name_new}-freestyle-bot",
    # Evaluate and checkpoint on the same schedule (every epoch) so that the
    # best checkpoint can be reloaded when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    num_train_epochs=7.0,
    learning_rate=2e-5,
    weight_decay=0.01,
    # A count of optimizer steps, so it is passed as an integer.
    warmup_steps=100,
)


'distilgpt2'

In [44]:
# Wire the model, the training arguments and the two chunked datasets together.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=lm_dataset_test,
    train_dataset=lm_dataset_train,
)

In [48]:
import torch

# Print the installed PyTorch version, then show whether CUDA is available.
print(torch.__version__)
torch.cuda.is_available()


2.2.2


False

In [None]:
# Start fine-tuning with the Trainer configured in the earlier cell.
trainer.train()