# Analysis of songs and their lyrics

## Installation and loading of libraries


In [1]:
import os
import spacy
import unidecode
import collections

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

2024-05-17 11:19:04.286212: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-17 11:19:05.722513: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-05-17 11:19:05.722658: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2024-05-17 11:19:07.438017: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcu

# Data and sources


## Data about songs
Between 1848 and ca. 1914 typographical associations created booklets with lyrics of songs they sang during feasts they organized. The dataset contains a table (in CSV) with an overview of all the songs in the booklets between 1848 and 1870, with among others title, year and writer.

In [2]:
liedjesDF = pd.read_csv("data/liedjes.csv", dtype={'jaartal': 'Int32'})
liedjesDF = liedjesDF.sort_values(by=['songID'])

print("Number of songs:    " + str(len(liedjesDF)))
print("Number of booklets:  " + str(len(liedjesDF['sourceID'].unique())))

Number of songs:    771
Number of booklets:  64


## Song lyrics from files
Besides the overview of the songs in a CSV-file, for every song there is a machine readable representation of the lyrics. We use the following functions to process them.

In [3]:
def getlistOfFilenames(rootdir):
    # input: rootdir: directory with (subdirectory with) TXT-files to be handled
	# output: list of TXT-files(+path) lexicographically ordered on path-name

    files_all = []
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            if not file.endswith('.txt'):
                continue
            fn = os.path.join(subdir, file)
            files_all.append(fn)

    files_all = sorted(files_all)
    return files_all

def getlistOfTexts(listOfFilenames):
	# input: listOfFilenames: list of TXT-files(+path) lexicographically ordered on path-name
    # output: list of texts

	texts = []
	for file in listOfFilenames:
		with open(file) as stream:
			text = stream.read()
		texts.append(text)

	return texts

def lemmatize(listOfTexts, select = True, allowed_postags=["NOUN", "ADJ", "ADV", "VERB"]):   
    # input: listOfTexts: list of Dutch texts
    # input: allowed_postags: list of wordtypes to be kept in the lemmatization process
    # output: list of lemmatized Dutch texts (list of lists of words). Lemmatization by spaCy.

    nlp = spacy.load("nl_core_news_sm") # create spaCy processor named 'nlp' based on small model for Dutch

    result = []                                                         
    for text in listOfTexts:                                                     

        nlp.max_length = len(text)  
        doc = nlp(text) # tokenize, lemmatize and annotate 'text' with processor named 'nlp'

        new_text = []
        for token in doc: 
            if token.is_alpha: # keep tokens with alphanumerical characters (so no numbers or punctuation)
                if not token.is_stop: # remove stopwords
                    if select:
                            if token.pos_ in allowed_postags: # keep wordtypes in the allowed_postags list
                                new_text.append(unidecode.unidecode(token.lemma_)) # get the word in the lemma and add it to the list of words
                    else: new_text.append(unidecode.unidecode(token.lemma_))

        result.append(" ".join(new_text)) # add text to the list of lemmatized texts

    return result
    
def replace(listOfWords, replaceWords):
    # input: listOfWords: list 
    i = 0
    for word in listOfWords:
        if word in replaceWords:
            listOfWords[i] = replaceWords[word]
        i = i + 1

    return listOfWords

def remove(listOfWords, removeWords):
    i = 0
    for word in listOfWords:
        if word in removeWords:
            listOfWords.pop(i)
        i = i + 1

    return listOfWords

def preprocess(listOfTexts, replaceDict, removeList):
    result = []
    for liedje in listOfTexts:
        liedje = liedje.lower().split()
        preprocessedLiedje = remove(replace(liedje, replaceDict), removeList)
        result.append(" ".join(preprocessedLiedje))

    return result

We use the above functions to read the lyrics.

In [4]:
liedjesFilenames    = getlistOfFilenames('data/lyrics')
liedjes             = getlistOfTexts(liedjesFilenames)
lemmatizedLiedjes   = lemmatize(liedjes, select = False)

replaceDict = {"koster":"coster", "kosters":"costers", "vreugd":"vreugde", "blijd":"blijde"}
removeList  = ["ha", "deez", "zoo", "hoezee", "tra", "la", "li", "eene", "gaan", "komen", "laten", "weer", "vinden", "uw", "staan", "waarmee", "immer", "t", "d", "wijze", "hurah"]

preprocessedLiedjes = preprocess(lemmatizedLiedjes, replaceDict, removeList)

To investigate whether our code has worked, we look at the data of song with number ```n```.

In [5]:
n = 200

print("-- data: --")
print(liedjesDF.iloc[n])
print("-- path: --")
print(liedjesFilenames[n])
print("-- song: --")
print(liedjes[n])
print("-- lemmatized song: --")
print(lemmatizedLiedjes[n])
print("-- preprocessed song: --")
print(preprocessedLiedjes[n])


-- data: --
typoID                                      amsterdam1857-1
sourceID                   amsterdam1857-1-feestbundel1862b
songID                  amsterdam1857-1-feestbundel1862b-06
titel                                       Typographenlied
wijze                    Eens werd er aan de zeeuwsche kust
jaartal                                                1862
schrijver                                         I. Poster
vereniging_schrijver                        amsterdam1857-1
Name: 264, dtype: object
-- path: --
data/lyrics/amsterdam1857-1/amsterdam1857-1-feestbundel1862b/amsterdam1857-1-feestbundel1862b-06.txt
-- song: --
Typographen-lied

Eens zag men in de Spaarnestad
Een groot genie verrijzen ;
Dat 't nakroost hem ook niet vergat,
Hiervan zijn de bewijzen.
Hij spreidde op de aard het grootste licht ;
Een standbeeld werd hem opgerigt.
Zijn naam zal eeuwig leven,
Zijn geest ons steeds omzweven.
Geen Tromp, de Ruijter, zelfs hoe groot,
Of welken held men noemt,
Geen die zoo

# TF-IDF



In [6]:
def listify(listOfTexts):
    # input: list of strings
    # output: list of list of strings ('words')
    result = []
    for liedje in listOfTexts:
        liedje = liedje.split()
        result.append(liedje)

    return result


def extract_vocabulary(tokenized_corpus):
    # Result: list of unique words derived from a list of lists of strings
    vocabulary = []
    for word_list in tokenized_corpus:
        for word in word_list:
            if word not in vocabulary:
                vocabulary.append(word)

    return sorted(vocabulary)


def count_words(vocabulary, tokenized_corpus):
	# result: dictionary with word count per word from the vocabulary in the tokenized corpus.

	# init
    count_dict = {}
    for word in vocabulary:
        count_dict[word] = 0

	# count
    for text_list in tokenized_corpus:
        for word in text_list:
            count_dict[word] = count_dict[word] + 1

    return count_dict


In [7]:
# cut up all Liedjes into lists of separate strings ('words')
listifiedLiedjes = listify(preprocessedLiedjes)

# generate vocabulary
liedjesVocabulary = extract_vocabulary(listifiedLiedjes)

# create dict with number of occurances per word
count = count_words(liedjesVocabulary, listifiedLiedjes)

# check the number of occurances of the word 'coster' in the corpus
count['coster']


867

In [8]:
def term_frequency(document, word):
    N = len(document)
    occurance = len([token for token in document if token == word])

    return occurance / N


def inverse_document_frequency(tokenized_corpus, word, vocabulary):
    word_count = count_words(vocabulary, tokenized_corpus)
    try:
        word_occurance = word_count[word] + 1
    except:
        word_occurance = 1

    return np.log((len(tokenized_corpus) + 1) / word_occurance)


def tf_idf(tokenized_corpus, document, word, vocabulary):
    tf = term_frequency(document, word)
    idf = inverse_document_frequency(tokenized_corpus, word, vocabulary)
    tf_idf = tf * idf

    return tf_idf

In [9]:
# term_frequency(listifiedLiedjes[200], 'blij')
# inverse_document_frequency(listifiedLiedjes, 'blij', liedjesVocabulary)
# tf_idf(listifiedLiedjes, listifiedLiedjes[200], 'coster', liedjesVocabulary)


In [10]:
def get_idf_vocabulary(tokenized_corpus, vocabulary):
    idf_vocabulary = {}
    for word in vocabulary:
        idf = inverse_document_frequency(tokenized_corpus, word, vocabulary)
        idf_vocabulary[word] = idf

    return idf_vocabulary

In [11]:
idf_voc = get_idf_vocabulary(listifiedLiedjes, liedjesVocabulary)
sorted_idf_voc = dict(sorted(idf_voc.items(), key=lambda x: x[1]))


In [12]:
count = 0
for key, value in sorted_idf_voc.items():
    if count < 250:
        print(key, value)
        count += 1


coster -0.11720716463557396
vreugde 0.14170683763976435
feest 0.24375609199393478
kunst 0.49625185532067184
zingen 0.5005162541071294
jaar 0.541961662282522
eer 0.5736385189360921
vrolijk 0.6087298387473621
komt 0.6328273903264227
geest 0.6352693939819745
leven 0.7167393625767655
o 0.74635121662341
blij 0.7939126278223491
hart 0.8498918955642503
zien 0.9156432731270308
lied 0.9929927392049235
dag 0.9964953697561256
drukkunst 1.0106298806910303
heil 1.0505625910264014
vieren 1.0542731704229371
kopperfeest 1.065488241243077
vriendschap 1.0960249651031588
blijven 1.1235316108929922
goed 1.1235316108929922
broeder 1.1600468238680899
drinken 1.168345626682785
schoon 1.1809244088896451
vriend 1.2508218485070233
costers 1.2553570036724147
licht 1.3066502980599652
welkom 1.3066502980599652
roem 1.3506671834767394
klinken 1.360717519330241
geven 1.4072375349651338
kring 1.4340487924157908
zang 1.4340487924157908
genoegen 1.4560276991345658
blijde 1.4672009997326911
naam 1.5014900732113232
hand 