# Feature Extraction: Standard Text Features

We extract here features from lyrics for a simple view and for a structered view.

In [1]:
# general
import pandas as pd
import numpy as np
import os
import sys
import re
import dill
from tqdm import tqdm

# tokenization of lyrics
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
nlp = spacy.load("en_core_web_trf")
nlp_blob = spacy.load("en_core_web_trf")
nlp_blob.add_pipe('spacytextblob')

# for a structered view
from sklearn.feature_extraction.text import CountVectorizer

# to get sentiment of individual words
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
path_to_lyrics = ''
path_to_save = ''

## Paths to lyrics

In [2]:
lyrics = path_to_lyrics
lyrics_directory = os.listdir(lyrics)

In [3]:
file_paths_lyrics = []
for file_name in lyrics_directory:
    # Create the full file path using os.path.join()
    file_path = os.path.join(lyrics, file_name)
    
    # Add the file path to the list
    file_paths_lyrics.append(file_path)
    
    
def extract_numeric_part(file_path):
    return [int(s) for s in os.path.basename(file_path).split('.') if s.isdigit()][0]

file_paths_lyrics = sorted(file_paths_lyrics, key=extract_numeric_part)
file_paths_lyrics = [file_path for file_path in file_paths_lyrics if not 'Zone.Identifier' in file_path]

## Cleaning

### List of all instances of the word 'chorus'

In [4]:
chorus = []
for file_path in file_paths_lyrics:
    with open(file_path, 'r') as file:
        for line in file:
            lyrics_line = line.strip('\n')
            if "chorus" in (lyrics_line).lower():
                chorus_tmp = lyrics_line.replace('[', '\\[')
                chorus_tmp = chorus_tmp.replace(']', '\\]')
                chorus_tmp = chorus_tmp.replace('*', '\\*')
                chorus_tmp = chorus_tmp.replace('(', '\\(')
                chorus_tmp = chorus_tmp.replace(')', '\\)')
                chorus.append(chorus_tmp)

In [5]:
chorus = np.unique(chorus)

In [6]:
chorus[6] = 'Chorus: '
chorus[7] = 'Chorus: ' 
chorus[8] = 'Chorus: '
chorus[11] = 'Chorus: '
chorus[23] = '\\*chorus starts\\* '
chorus[38] = '\\[Chorus\\] '

In [7]:
chorus = [chorus[i].lower() for i in range(len(chorus))]

In [8]:
chorus = np.unique(chorus)

### List of [repeat] and other comments

In [15]:
repetition_styles = ['[(\\[]*sing [0-9]+x[)\\]]*', '[(\\[]*sing x[0-9]+[)\\]]*',
                     '[(\\[]*repeat [0-9]+x[)\\]]*', '[(\\[]*repeat x[0-9]+[)\\]]*',
                     '[(\\[]*[0-9]+x[)\\]]*','[(\\[]*x[0-9]+[)\\]]*',
                     '[(\\[]+.*repeat.*[)\\]]+',
                     '\\(scat singing + title, x4\\)',
                     '\\(repeat to fade',
                     'coda \\[repeat to fade over trash organ riff from intro\\]: ',
                     'repeat \\(\\*\\)',
                     'repeat 3 times',
                     'repeat to fade',

                     '[(\\[]+.*solo.*[)\\]]+',
                     '[(\\[]+.*break.*[)\\]]+',
                     '[(\\[]+.*instrumental.*[)\\]]+',
                     '[(\\[]+.*spoken.*[)\\]]+',
                     '[(\\[]+.*guitar.*[)\\]]+',
                     '[(\\[]+.*whistle.*[)\\]]+',
                     '[(\\[]+.*noises.*[)\\]]+',
                     '[(\\[]+.*chanting.*[)\\]]+',
                     '[(\\[]+.*lyrics.*[)\\]]+',
                     '[(\\[]+.*verse.*[)\\]]+',
                     '[(\\[]+.*outro.*[)\\]]+',
                     '[(\\[]+.*interlude.*[)\\]]+',
                     '\\[Hook\\]', '\\[DMX\\]', '\\[LL\\]',
                     '[(\\[]+', '[)\\]]+'
                     ]

### Preparing dataset

Delete 'chorus' and 'reapeat' etc, delete stop words

Split into lines and verses

In [16]:
def delete_the_longest_matching(text, substrings):
    all_matching = [re.findall(substrings[i], text)[0] for i in range(len(substrings)) if re.search(substrings[i], text) != None]
    if len(all_matching) == 0:
        return text, False
    len_all_matching = [len(all_matching[i]) for i in range(len(all_matching))]
    the_longest_matching = np.argmax(len_all_matching)
    return text.replace(all_matching[the_longest_matching], ''), True

In [17]:
stop_words_part_of_speech = ['DET', 'PRON', 'CONJ', 'CCONJ', 'ADP', 'PUNCT']

def one_song_lyrics(file_path):
    
    lyrics = []
    lyrics_tokenization = []
    verse_ind = []
    line_ind = []
    verse_num = 0
    line_num = 0
    prev_verse_num = False

    with open(file_path, 'r') as file:
        for line in file:
            lyrics_line = line.strip('\n').lower()

            lyrics_line, is_chorus = delete_the_longest_matching(lyrics_line, chorus)
            lyrics_line, _ = delete_the_longest_matching(lyrics_line, repetition_styles)
            
            if (len(lyrics_line.lower()) == 0 or lyrics_line.lower() == '\n'  or lyrics_line.lower() == ' ' or is_chorus):
                if not prev_verse_num:
                    verse_num += 1
                    prev_verse_num = True
            else:
                prev_verse_num = False
                verse_ind.append(verse_num)
                line_ind.append(line_num)
                line_num += 1
                lyrics.append(lyrics_line)
                nlp_n = nlp(lyrics_line)
                nlp_n = [nlp_n_w for nlp_n_w in nlp_n if not nlp_n_w.pos_ in stop_words_part_of_speech]
                lyrics_tokenization.append(' '.join([nlp_n_w.lemma_ for nlp_n_w in nlp_n]))

    df = pd.DataFrame({
        'verse': verse_ind,
        'line': line_ind,
        'lyrics': lyrics,
        'tokenized_lyrics': lyrics_tokenization
    })

    return df

In [18]:
len(file_paths_lyrics)

764

In [19]:
df = None
for file_path in tqdm(file_paths_lyrics):
    df_tmp = one_song_lyrics(file_path)
    df_tmp[['file_path']] = file_path
    df_tmp[['song_id']] = re.findall('[0-9]+', file_path)
    df = pd.concat([df_tmp, df])
    
df = df[::-1]
df = df.reset_index()

100%|██████████| 764/764 [17:55<00:00,  1.41s/it]


In [20]:
df.to_csv('data/text_modality/preprocessed_lyrics.csv')

## Extract features for a simple view

In [None]:
df = pd.read_csv(os.path.join(path_to_save,'preprocessed_lyrics.csv'), dtype=str, index_col=0)

In [119]:
def extract_features(df):

    song_ids = np.unique(df['song_id'])
    df_res=pd.DataFrame()

    for song_num in tqdm(range(len(song_ids))):
        df_tmp = df[df['song_id'] == song_ids[song_num]]
        tokenized_lyrics_join = ' '.join(df_tmp['tokenized_lyrics'].dropna())
        tokenized_lyrics_join_split = tokenized_lyrics_join.split(' ')
        word_token_len = [len(tokenized_lyrics_join_split[i]) for i in range(len(tokenized_lyrics_join_split))]
        lyrics_join = ' '.join(df_tmp['lyrics'])
        lyrics_join_split = lyrics_join.split(' ')

        line_num = len(df_tmp['lyrics'])
        word_num = len(lyrics_join_split)
        verse_num = np.unique(df_tmp['verse']).shape[0]
        word_token_num = len(tokenized_lyrics_join_split)
        word_token_unique_num = len(np.unique(tokenized_lyrics_join_split))
        word_token_mean_len = np.mean(word_token_len)
        word_token_max_len = np.max(word_token_len)
        word_token_std_len = np.std(word_token_len)

        stop_words_part_of_speech = ['DET', 'PRON', 'CONJ', 'CCONJ', 'ADP', 'PUNCT']
        nlp_n = nlp_blob(lyrics_join)
        stop_words_song = [nlp_n_w.pos_ for nlp_n_w in nlp_n if nlp_n_w.pos_ in stop_words_part_of_speech]
        stop_words_counts = [stop_words_song.count(stop_words_part_of_speech[i]) for i in range(len(stop_words_part_of_speech))]

        polarity = nlp_n._.blob.polarity
        subjectivity = nlp_n._.blob.subjectivity

        df_res_tmp = pd.DataFrame({
            'lines': [line_num],
            'words': [word_num],
            'verses': [verse_num],
            'words_token': [word_token_num],
            'unique_words_token': [word_token_unique_num],
            'length_mean_words_token': [word_token_mean_len],
            'length_max_words_token': [word_token_max_len],
            'length_mstd_words_token': [word_token_std_len],
            'polarity': polarity,
            'subjectivity': subjectivity,
            'DET': stop_words_counts[0],
            'PRON': stop_words_counts[1],
            'CONJ': stop_words_counts[2],
            'CCONJ': stop_words_counts[3],
            'ADP': stop_words_counts[4],
            'PUNCT': stop_words_counts[5]
        })

        df_res = pd.concat([df_res, df_res_tmp], ignore_index = True)
    
    return df_res

In [120]:
df_res = extract_features(df)

100%|██████████| 764/764 [07:51<00:00,  1.62it/s]


In [121]:
df_res.to_csv(os.path.join(path_to_save,'preprocessed_lyrics_simpleview.csv'))

## Prepare structered view

In [11]:
df = pd.read_csv(os.path.join(path_to_save,'preprocessed_lyrics.csv'), dtype=str, index_col=0)

In [12]:
ind_nan = df['tokenized_lyrics'].isnull().values
df = df.loc[~ind_nan,:]
tokenized_lyrics = df['tokenized_lyrics']

In [13]:
# list of words that appeared at least 20 times
min_df0 = 20
vectorizer = CountVectorizer(min_df=min_df0, stop_words='english')
X = vectorizer.fit_transform(tokenized_lyrics)
dictionary_text_modality = vectorizer.get_feature_names_out()

In [14]:
# 577 unique words
X.shape

(26287, 577)

In [101]:
# save vocabulary - 577 words
np.savetxt(os.path.join(path_to_save,'vocabulary_577.txt'), dictionary_text_modality, fmt="%s")

In [17]:
# save ids of songs with lyrics
song_ids = [str(i)[1:4] for i in range(1001, 1904)]
df_is_lyrics = {
    'song_id': song_ids,
    'exists': [song_ids[i] in np.sort(df['song_id'].unique()) for i in range(len(song_ids))]
}
pd.DataFrame(df_is_lyrics).to_csv(os.path.join(path_to_save,'lyrics_exist.csv'))

## Data points (sentences) consist of words in lines

In [21]:
structered_view = [X[df['song_id'] == song_ids[i]].toarray() for i in range(len(song_ids))]

In [22]:
# delete empty samples
structered_view = [structered_view[i] for i in range(len(structered_view)) if structered_view[i].shape[0] > 0]

In [23]:
# delete empty data points
structered_view = [structered_view[i][np.sum(structered_view[i], axis=1) > 0,:] for i in range(len(structered_view))]

In [24]:
# chceck how many samples with small numbers of data points
np.sum(np.array([structered_view[i].shape[0] for i in range(len(structered_view))]) < 5)

4

In [25]:
# the number of data points
np.vstack(structered_view).shape

(22564, 577)

In [115]:
with open(os.path.join(path_to_save,'sentences_lines_577.pkl'), 'wb') as f:  
    dill.dump(structered_view, f)

## Vocabulary - sentiment of words

In [150]:
vocab = pd.read_csv(os.path.join(path_to_save,'vocabulary_577.txt'), header=None)
vocab = np.array(vocab[[0]])
vocab = vocab.flatten().tolist()

In [151]:
sid = SentimentIntensityAnalyzer()

In [154]:
# get the sentiment of individual words
neg = [sid.polarity_scores(vocab[i])['neg'] for i in range(len(vocab))]
pos = [sid.polarity_scores(vocab[i])['pos'] for i in range(len(vocab))]
com = [sid.polarity_scores(vocab[i])['compound'] for i in range(len(vocab))]

In [160]:
df_sentiment = pd.DataFrame({'vocab': vocab,
                             'negative': neg,
                             'positive': pos,
                             'compound': com})

In [156]:
# get part of speech of individual words
part_of_speech = [nlp_blob(vocab[i])[0].pos_ for i in range(len(vocab))]
df_pos = pd.get_dummies(part_of_speech)

In [185]:
# merge and save
df_features = pd.concat([df_sentiment, df_pos], axis=1)
df_features.to_csv(os.path.join(path_to_save,'lyrics_vocab_description_577.csv'))