In [1]:
import re
import time
import spacy
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocessing

The overall break down of preprocessing is as follows:
   1. Filter out certain headers and tags in the lyrics
   2. Apply POS tagger
   3. Binary encode True/False labels 
   4. Appy TF-IDF with unigrams, bigrams and trigrams
   5. Create training and testing sets 
   
To start I'll load in the data. 

In [2]:
data = pd.read_csv('../data/lyrics_data.csv')
print(data.shape)
data.head(3)

(27057, 8)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,album_genre,genre,artist,title,explicit_label,lyrics
0,0,0,Folk Rock,,Jim Croce,Top Hat Bar And Grille,False,"Well, if you're lookin' for a good time\nLook ..."
1,1,1,Pop,,Aloe Blacc,The Hand Is Quicker,False,The hand is quicker than the eye\nAnd sometime...
2,2,2,Pop,"[""Pop rock""]",Lionel Richie,Dancing On The Ceiling,False,(Cheering)\nWhoo!\n\nWhat is happening here\nS...


In [3]:
data = data[data['explicit_label'].notna()]
print(data.shape)

(27057, 8)


### Filter Sections and Headers in Lyrics

There are a few key things that have to be done in order to preprocess the lyrics data. As per an article on [genius](https://genius.com/Genius-song-sections-and-headers-guide-annotated) , there are several sections and headers a song can have. Examples of this can be **\[Chorus\]** or **\[Intro\]**.

These sections are labelled in the lyrics are part of the output from the Genius API. These have to be filtered out, as these section and header annotations are not part of the original song lyrics.

In addition to removing the sections and headers, I will also remove unnecesary line breaks, tags at the end of the text and any non-english characters.

In [4]:
def preprocess_lyrics(lyrics):
    if type(lyrics) is not str:
        lyrics = ""
        return lyrics
    else:
        lyrics = re.sub(r'[^a-zA-Z0-9: +=*&^%$#@!~`";:?/\\<>.,\-_\]\[\'\n]', '', lyrics) # remove any non english characters
        lyrics = re.sub(r'[0-9]', '', lyrics) # remove numbers
        lyrics = re.sub(r"[a-zA-Z0-9 -:]*]", "", lyrics) # remove song sections
        lyrics = re.sub(r"\[", "", lyrics) # remove bracket
        lyrics = re.sub(r"[\n]{1,}", ". ", lyrics) # remove multiple newline characters with just a single line
        lyrics = re.sub(r"[0-9]*EmbedShare URLCopyEmbedCopy", "", lyrics) # remove tag at the end

    return lyrics

In [5]:
data['lyrics'] = data['lyrics'].apply(preprocess_lyrics)

In [6]:
data = data[['artist', 'title', 'lyrics', 'explicit_label']]

In [7]:
data.head(3)

Unnamed: 0,artist,title,lyrics,explicit_label
0,Jim Croce,Top Hat Bar And Grille,"Well, if you're lookin' for a good time. Look ...",False
1,Aloe Blacc,The Hand Is Quicker,The hand is quicker than the eye. And sometime...,False
2,Lionel Richie,Dancing On The Ceiling,Cheering. Whoo!. What is happening here. Somet...,False


## POS Tagging

In order to gain more information on the content of the lyrics, I am going to use a Part of Speach Tagger. 

In [8]:
nlp = spacy.load('en_core_web_lg')

In [9]:
def lyric_to_pos(lyric):
    doc = nlp(lyric)
    
    # get the pos and append it to the token
    pos_lyrics = [token.text + '_' + token.tag_ for token in doc]
    return ' '.join(pos_lyrics)

In [10]:
start_time = time.time()
# apply pos tagger
pos_lyrics = data['lyrics'].apply(lyric_to_pos)
end_time = time.time()
print("Execution time: %s min" % ((end_time - start_time)/60))

Execution time: 67.26693824927013 min


In [11]:
data['pos_lyrics'] = pos_lyrics

In [12]:
data.head(3)

Unnamed: 0,artist,title,lyrics,explicit_label,pos_lyrics
0,Jim Croce,Top Hat Bar And Grille,"Well, if you're lookin' for a good time. Look ...",False,"Well_UH ,_, if_IN you_PRP 're_VBP lookin_JJ '_..."
1,Aloe Blacc,The Hand Is Quicker,The hand is quicker than the eye. And sometime...,False,The_DT hand_NN is_VBZ quicker_JJR than_IN the_...
2,Lionel Richie,Dancing On The Ceiling,Cheering. Whoo!. What is happening here. Somet...,False,Cheering_VBG ._. Whoo_UH !_. ._. What_WP is_VB...


## Encode Labels

Next, I will simply encode the `True`/`False` labels to be either 0 or 1. I will do this using the `LabelEncoder` from sklearn. 

In [13]:
le = LabelEncoder()
le.fit(data['explicit_label'])
y = pd.Series(le.transform(data['explicit_label']))
data['explicit_label'] = y

## TF-IDF

The last major step will be convert the data into some numerical format that I can feed into a ML model To do this, I will use `TfidfVectorizer` from sklearn. I will also use unigrams, bigrams and trigrams and set the maximum number of features to 15000.

In [14]:
start_time = time.time()
tfidf = TfidfVectorizer()
tfidf = TfidfVectorizer(max_features=30000, ngram_range=(1,3))
X = tfidf.fit_transform(data['pos_lyrics']).toarray()
X = pd.DataFrame(X, columns=tfidf.get_feature_names())
end_time = time.time()
print("Execution time: %s min" % ((end_time - start_time)/60))
X.head()

Execution time: 1.4844204346338907 min


Unnamed: 0,__dt,__nn,__nn __nn,__nn __nn __nn,__sp,__sp __sp,__sp __sp __sp,__sp _fw,__sp _hyph,__sp _hyph int_nnp,...,z_nnp,zaman_nnp,zau_nnp,zau_nnp al_nnp,zau_nnp al_nnp _hyph,zed_nnp,zero_cd,zoe_nnp,zone_nn,zui_nnp
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.041529,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Create Training and Testing Sets

The only thing left to do is to create the training and testing sets. I will use `train_test_split` to accomplish this and do a 70-30 split.

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [16]:
X_train.to_csv('../data/X_train.csv')
X_test.to_csv('../data/X_test.csv')
y_train.to_csv('../data/y_train.csv')
y_test.to_csv('../data/y_test.csv')