# GPT-2 dataset - preprocessing

This notebook provides the code for preprocessing the dataset that will be used to finetune the GPT-2 model on heavy metal lyrics

## Import libraries

In [1]:
import nltk
import pandas as pd
import os
import re
import shutil
import string

from pandarallel import pandarallel
from readability.readability import Readability
from sklearn.model_selection import train_test_split
from tqdm import tqdm

nltk.download('punkt')
pandarallel.initialize()
tqdm.pandas()

[nltk_data] Downloading package punkt to /home/deepmetal/nltk_data...


INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


[nltk_data]   Unzipping tokenizers/punkt.zip.
  from pandas import Panel


In [2]:
dataset_eng_df = pd.read_csv('../datasets/metal_dataset_clean_eng.csv', encoding='utf-8', sep='|') \
                    .drop(columns=['lang_iso', 'lang_name'])

In [19]:
SWEAR_WORDS = [str(line.rstrip('\n')) for line in open("./common/resources/swear_words_eng.txt", "r")]
STOPWORDS = list(set([str(line.rstrip('\n')) for line in open("./common/resources/stopwords_eng.txt", "r")]))
PUNCTUATION =  list(string.punctuation) + ['..', '...', '’', "''", '``', '`']

def count_swear_word_ratio(text):
    counter = 0
    text = text.lower().replace('\\n', ' ')
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(translator)
            
    for word in SWEAR_WORDS:
        counter += sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(word), text))
    number_of_words = len(nltk.word_tokenize(text))
    
    return counter / number_of_words

def measure_readability(text):
    text = text.replace('\\n', ' ')
    try:
        cl_grade = int(Readability(text).coleman_liau().grade_level)
    except Exception as e:
        cl_grade = 1
    
    return max(1, cl_grade)

def uppercase_counter(text):
    return sum(map(str.isupper, text.split()))

In [4]:
dataset_eng_df["swear_word_ratio"] = dataset_eng_df.lyrics.parallel_apply(count_swear_word_ratio)
dataset_eng_df["readability"] = dataset_eng_df.lyrics.parallel_apply(measure_readability)

print("**Dataset with swearwords and readability coefficients**")
dataset_eng_df.sample(n=5)

**Dataset with swearwords and readability coefficients**


Unnamed: 0,artist,album,album_type,release_year,title,track_no,lyrics,swear_word_ratio,readability
94054,Inbreeding Rednecks,Abnormal Life Portrayed,Studio,2013,Eyes Of Deception,3,Bow to the two legged demon\nBorn in a slow de...,0.010695,7
101549,Kick Axe,Vices,Studio,1984,All The Right Moves,9,Well hey have you seen her\nShe's all over tow...,0.0,7
66536,Epidemic,Exit Paradise,Studio,1994,Written In Blood,9,On stolen land you stand\nAnd rant of good and...,0.0,12
75232,Freedom Call,Legend Of The Shadowking,Studio,2010,Out Of The Ruins,1,No one's got the power to defy the tyranny\nOn...,0.0,7
35230,Cemetery Of Scream,Prelude To A Sentimental Journey,Studio,2000,Bridge To A Desert,12,The role of Jack-o-lanterns is to be a bridge\...,0.0,5


In [6]:
dataset_eng_df.to_csv('../datasets/dataset_eng_readability_data_df.csv', index=False, sep='|')

## Import and preprocess dataset

Set all the lyrics in a unique file and delimit each text with the strings `<|startoftext|>` and `<|endoftext|>`.

In [9]:
bands_to_exclude = [
    'Týr',  # Lyrics in both english and icelandic
]

In [15]:
metal_lyrics_df = pd.read_csv("../datasets/dataset_eng_readability_data_df.csv", sep="|")
print(f"Number of songs: {len(metal_lyrics_df)}")

metal_lyrics_df = metal_lyrics_df[metal_lyrics_df.readability > 1]
print(f"Number of songs after excluding those with readability = 1: {len(metal_lyrics_df)}")

metal_lyrics_df = metal_lyrics_df[metal_lyrics_df.swear_word_ratio < 0.06]
print(f"Number of songs after excluding those with swear word ratio >= 0.06: {len(metal_lyrics_df)}")

metal_lyrics_df = metal_lyrics_df[~metal_lyrics_df.artist.isin(bands_to_exclude)]
print(f"Number of songs after removing bands: {len(metal_lyrics_df)}")

Number of songs: 195202
Number of songs after excluding those with readability = 1: 155837
Number of songs after excluding those with swear word ratio >= 0.06: 154912
Number of songs after removing bands: 154838


In [16]:
metal_lyrics_df.to_csv('../datasets/cleaned_dataset_partial_df.csv', index=False, sep='|')

In [22]:
metal_lyrics_df["uppercase_counter"] = metal_lyrics_df.lyrics.parallel_apply(uppercase_counter)
metal_lyrics_df = metal_lyrics_df[metal_lyrics_df.uppercase_counter < 10]

metal_lyrics_df

Unnamed: 0,artist,album,album_type,release_year,title,track_no,lyrics,swear_word_ratio,readability,uppercase_counter
0,'68,In Humor And Sadness,Studio,2014,Track 1 R,1.0,Take your heart into the next room.\nMake a sc...,0.000000,5.0,1
1,'68,In Humor And Sadness,Studio,2014,Track 2 E,2.0,"It doesn't matter what you say,\nThey could al...",0.000000,7.0,2
2,'68,In Humor And Sadness,Studio,2014,Track 3 G,3.0,"Hey kid, don't listen to them.\nWe got a long,...",0.000000,3.0,1
3,'68,In Humor And Sadness,Studio,2014,Track 4 R,4.0,Stained glass.\nI wish that I could say that i...,0.000000,4.0,2
4,'68,In Humor And Sadness,Studio,2014,Track 6 T,6.0,Over thought.\nUndersold.\nI have sung every s...,0.000000,3.0,4
...,...,...,...,...,...,...,...,...,...,...
154080,Zørormr,Corpus Hermeticum,Studio,2015,In The Mouth Of Madness,3.0,Behind the curtains of sanity\nA slow descent ...,0.006369,8.0,1
154081,Zørormr,Corpus Hermeticum,Studio,2015,Worship Me...,8.0,"Hail Moloch, Hail The True God!\nHail Moloch, ...",0.000000,6.0,2
154082,Zørormr,Corpus Hermeticum,Studio,2015,This I Command!,9.0,"In the name of Poimandres,\nThis I command!\n\...",0.000000,6.0,5
154083,Zørormr,The Aftermath,EP,2016,The Last Judgement,1.0,Adonai!\nCleanse me from sin\nSo I shall be bo...,0.000000,5.0,4


In [23]:
metal_lyrics_df.to_csv('../datasets/cleaned_dataset_partial_df.csv', index=False, sep='|')

In [17]:
metal_lyrics_df = pd.read_csv('../datasets/cleaned_dataset_partial_df.csv', encoding='utf-8', sep='|')

In [4]:
train_test_ratio = 0.9
train_val_ratio = 0.75

df_full_train, df_test = train_test_split(
    metal_lyrics_df, 
    train_size=train_test_ratio, 
    random_state=1
)
df_train, df_valid = train_test_split(
    df_full_train, 
    train_size=train_val_ratio, 
    random_state=1
)

In [5]:
def build_dataset(df, dest_path):
    f = open(dest_path, 'w')
    data = ''
    summaries = df['lyrics'].tolist()
    for summary in summaries:
        summary = str(summary).strip()
        summary = summary.replace("\\n", "\n")
        bos_token = '<|startoftext|>'
        eos_token = '<|endoftext|>'
        data += bos_token + ' ' + summary + ' ' + eos_token + '\n'
        
    f.write(data)

In [6]:
build_dataset(df_train, '../datasets/deepmetal_train.txt')
build_dataset(df_valid, '../datasets/deepmetal_val.txt')
build_dataset(df_test, '../datasets/deepmetal_test.txt')