In [1]:
import numpy as np
import pandas as pd

import os
import re

import nltk
from nltk.corpus import gutenberg, wordnet, stopwords
from nltk.tokenize import sent_tokenize , word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation

# NLTK resources used by this notebook:
#   punkt                       -> nltk.word_tokenize
#   averaged_perceptron_tagger  -> nltk.pos_tag
#   wordnet, omw-1.4            -> WordNetLemmatizer
#   stopwords                   -> stopwords.words('english')
# nltk.download() is a no-op when a package is already up to date.
# NOTE(review): recent NLTK releases appear to look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - add those if a LookupError asks for them.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\gooog\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gooog\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Single WordNetLemmatizer instance; clean_lyrics() calls lemmatizer.lemmatize(word, tag) on it.

lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_to_wordnet(nltk_tag):
    """Translate a POS tag from nltk.pos_tag into the matching WordNet constant.

    The decision is made on the tag's leading letter:
    J -> wordnet.ADJ, V -> wordnet.VERB, N -> wordnet.NOUN, R -> wordnet.ADV.
    Any other tag yields None.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return None
    
# Hand-picked tokens that carry no meaning for the analysis (scraping artefacts such as
# 'embed'/'likeembed', filler words, and tokenizer fragments of contractions).
# Contraction fragments are written the way the tokenizer emits them ("n't", "'s", "'m"),
# i.e. without a closing quote - the previous "'s'", "'m'" and "'cause'" could never match.
# clean_lyrics() additionally drops every token containing an apostrophe.
insignificant_words = ['embed', 'likeembed','might','also','like','lyric','know','go','say','oh',
                       'ooh','get','well','come','make','one', 'yeah', 'ay','ai','see',
                       'take','na','ca','let','tell','gon','wan',"``",'...', "'s","n't", "'m", "'cause"]
# Everything to discard: English stopwords + single punctuation characters + the list above.
stoplist = set(stopwords.words('english') + list(punctuation) + insignificant_words)

def clean_lyrics(lyrics):
    """Turn raw lyrics text into a list of unique, lemmatized, meaningful tokens.

    Steps: lower-case the text, strip digit runs, tokenize and POS-tag it with NLTK,
    lemmatize each token using its POS tag (tokens whose tag has no WordNet
    equivalent are kept unchanged), de-duplicate, then drop everything found in
    ``stoplist`` as well as any token containing an apostrophe.

    Parameters
    ----------
    lyrics : str
        Raw lyrics text.

    Returns
    -------
    list of str
        Each surviving token once, in order of first appearance (so the result is
        the same on every run, unlike a set-based de-duplication).
    """
    # lower-case, then remove numbers
    lyrics_nonum = re.sub(r'\d+', '', lyrics.lower())

    # tokenize the lyrics and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(lyrics_nonum))

    lemmatized_lyrics = []
    for word, nltk_tag in nltk_tagged:
        tag = nltk_to_wordnet(nltk_tag)
        if tag is None:
            # no usable WordNet tag: keep the token as is
            lemmatized_lyrics.append(word)
        else:
            lemmatized_lyrics.append(lemmatizer.lemmatize(word, tag))

    # dict.fromkeys de-duplicates while keeping first-seen order
    unique_tokens = list(dict.fromkeys(lemmatized_lyrics))

    # remove stopwords and contraction fragments
    return [word for word in unique_tokens if word not in stoplist and "'" not in word]
    

In [5]:
# Clean the lyrics of every song, year by year, and write one CSV per year to
# songs_cleaned/songs_<year>.csv with the columns title;artist;filename;lyrics.
# NOTE: df_songs is rebuilt for each year, so after this cell it holds only the
# last year processed (1955).
os.makedirs("songs_cleaned", exist_ok=True)
for year in range(2022, 1954, -1):  # 2022 down to 1955
    print(year)
    path_songs = f"songs/songs_{year}"
    # sorted(): os.listdir() returns entries in arbitrary order
    songlist = sorted(
        song for song in os.listdir(path_songs)
        if os.path.isfile(os.path.join(path_songs, song))
    )

    # One dict per song; the DataFrame is built once at the end instead of
    # growing it with pd.concat on every iteration.
    records = []
    for song in songlist:
        # Title and artist come from the file name (extension removed), split at the
        # first "by". If there is no "by", the whole name is the title and artist is ''.
        # NOTE(review): "by" is matched as a plain substring, so a title containing
        # it (e.g. "Baby ...") is cut early - confirm the file naming scheme.
        title, _, artist = os.path.splitext(song)[0].partition('by')

        # clean lyrics; the with-block makes sure the file handle is closed
        with open(os.path.join(path_songs, song), 'r', encoding='cp1252') as source:
            lyrics = source.read()

        records.append({
            'title': title,
            'artist': artist,
            'filename': song,
            'lyrics': ' '.join(clean_lyrics(lyrics)),
        })

    # explicit columns keep the header intact even when a year folder is empty
    df_songs = pd.DataFrame(records, columns=['title', 'artist', 'filename', 'lyrics'])

    df_songs.to_csv(f"songs_cleaned/songs_{year}.csv", sep=";", index=False)

2022
2021
2020
2019
2018
2017
2016
2015
2014
2013
2012
2011
2010
2009
2008
2007
2006
2005
2004
2003
2002
2001
2000
1999
1998
1997
1996
1995
1994
1993
1992
1991
1990
1989
1988
1987
1986
1985
1984
1983
1982
1981
1980
1979
1978
1977
1976
1975
1974
1973
1972
1971
1970
1969
1968
1967
1966
1965
1964
1963
1962
1961
1960
1959
1958
1957
1956
1955


In [None]:
def get_most_common(df, top_n=None):
    """Print the words found in ``df["lyrics"]`` with their counts, most frequent first.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a "lyrics" column whose entries are whitespace-separated tokens.
        Entries that are not strings (e.g. NaN, which is what an empty lyrics
        field becomes after a CSV round-trip) are skipped.
    top_n : int, optional
        Print at most this many rows. ``None`` (the default) prints every word.

    Returns
    -------
    None
        Output goes to stdout, one "<word padded to 20 chars> <count>" line per word.
        Ties are ordered by word, descending. Words containing an apostrophe are
        not printed.
    """
    tally = {}
    for lyrics in df["lyrics"]:
        if not isinstance(lyrics, str):
            continue
        # split() without an argument never yields '' (split(" ") does for empty lyrics)
        for word in lyrics.split():
            tally[word] = tally.get(word, 0) + 1

    ranked = sorted(tally.items(), key=lambda item: (item[1], item[0]), reverse=True)

    printed = 0
    for word, count in ranked:
        if "'" in word:
            continue
        if top_n is not None and printed >= top_n:
            break
        print(f"{word:20} {count}")
        printed += 1

In [None]:
# Word frequencies for whatever `df_songs` currently holds.
# NOTE(review): the cleaning loop reassigns `df_songs` on every iteration while counting
# down to 1955, so at this point it presumably contains only the 1955 songs - confirm
# that a single year is the intended input.
get_most_common(df_songs)

In [None]:
import itertools
import networkx as nx

In [None]:
# Word co-occurrence counts over the songs in df_songs:
# network[(word_a, word_b)] = number of songs whose lyrics contain both words.
# The graph is undirected, so each unordered pair lives under a single key (the
# orientation in which it was first seen) and self-loops are never created.
network = {}
network_key = 0  # not read by any cell shown here; kept in case a later cell expects it
for lyrics in df_songs["lyrics"]:
    # non-string lyrics (e.g. NaN after a CSV round-trip) contribute no pairs
    if not isinstance(lyrics, str):
        continue
    # unique words in first-seen order, so a repeated word cannot inflate a count
    song_words = list(dict.fromkeys(lyrics.split()))
    # combinations() yields each unordered pair of distinct words exactly once,
    # instead of filtering the full Cartesian product
    for pair in itertools.combinations(song_words, 2):
        if pair[::-1] in network:
            pair = pair[::-1]
        network[pair] = network.get(pair, 0) + 1

network_df = pd.DataFrame.from_dict(network, orient="index")

In [None]:
# Two-column view of the co-occurrence counts, heaviest pairs first.
# Built straight from `network` rather than mutated in place, so the cell can be
# re-run safely and also works when `network` is empty.
network_df = pd.DataFrame(
    {"pair": list(network.keys()), "weight": list(network.values())}
).sort_values(by="weight", ascending=False)
network_df

In [None]:
# A weighted graph is built from 3-element tuples (u, v, w): u and v are the nodes,
# w is the number used as the edge weight.
# To filter edges by weight, append a condition such as `if weight > 1` to the
# comprehension and set the desired threshold.
up_weighted = [
    (source, target, weight)
    for (source, target), weight in network.items()
]

G = nx.Graph()
G.add_weighted_edges_from(up_weighted)

In [None]:
# graph size: number of nodes first, then number of edges
for view in (G.nodes(), G.edges()):
    print(len(view))

In [None]:
# Export the graph as a CSV edge list with a "Source,Target,Weight" header row.
# The file goes to the working directory (the old "/edgelist.csv" pointed at the
# filesystem root while the header step edited ./edgelist.csv).
# The header is written from Python, so no shell `sed` is needed and the cell
# behaves the same on Windows, macOS and Linux.
filename = "edgelist.csv"
# binary mode: networkx writes encoded bytes when it is handed an open file
with open(filename, "wb") as edgelist_file:
    edgelist_file.write(b"Source,Target,Weight\n")
    nx.write_weighted_edgelist(G, edgelist_file, delimiter=",")