In [68]:
import numpy as np
import pandas as pd

import os
import re

import nltk
from nltk.corpus import gutenberg, wordnet, stopwords
from nltk.tokenize import sent_tokenize , word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation

nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package omw-1.4 to /Users/jacob/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/jacob/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [98]:
# function to lemmatize

lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_to_wordnet(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None
    
# list with stopwords and punctuation to remove + manually created list of non-meaningful words
insignificant_words = ['embed', 'likeembed','might','also','like','lyric','know','go','say','oh',
                       'ooh','get','well','come','make','one', 'yeah', 'ay','ai','see',
                       'take','na','ca','let','tell','gon','wan',"``",'...', "'s'","n't", "'m'", "'cause'"]
stoplist = set(stopwords.words('english') + list(punctuation) + insignificant_words)

def clean_lyrics(lyrics):
    # change everything to lower case
    lyrics = lyrics.lower()
    # remove numbers
    lyrics_nonum = re.sub(r'\d+', '', lyrics)
    
    #tokenize the lyrics and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(lyrics_nonum))  
    
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_to_wordnet(x[1])), nltk_tagged)
    lemmatized_lyrics = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_lyrics.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_lyrics.append(lemmatizer.lemmatize(word, tag))
            
    unique_tokens = unique_tokens = list(set(lemmatized_lyrics))
    
    # remove stopwords
    unique_nostop = [word for word in unique_tokens if word not in stoplist]
    unique_nostop = [word for word in unique_nostop if not "'" in word]
    return unique_nostop
    

In [103]:
path_songs = "snippets/songs_2022"
songlist = [song for song in os.listdir(path_songs) if os.path.isfile(os.path.join(path_songs, song))]
songlist

['7 Days by YoungBoy Never Broke Again.txt',
 'Gravity by Brent Faiyaz.txt',
 'No Switch by YoungBoy Never Broke Again.txt',
 'City Of Gods by Fivio Foreign Kanye West.txt',
 'High by The Chainsmokers.txt',
 'Johnny Ps Caddy by Benny The Butcher.txt',
 'Tequila Little Time by Jon Pardi.txt',
 'Take My Name by Parmalee.txt',
 'Little Freak by Harry Styles.txt',
 'Daydreaming by Harry Styles.txt',
 'Open Arms by SZA.txt',
 'Euthanasia by Post Malone.txt',
 'A Holly Jolly Christmas by Burl Ives.txt',
 'Despecha by Rosalia.txt',
 'Knowing You by Kenny Chesney.txt',
 'Smoke One by YoungBoy Never Broke Again.txt',
 'Vette Motors by YoungBoy Never Broke Again.txt',
 'In My Head by Lil Tjay.txt',
 'Umbrella by Metro Boomin 21 Savage.txt',
 'In The Stars by Benson Boone.txt',
 'Not Finished by Lil Baby.txt',
 'Cold December by Rod Wave.txt',
 'IDK That Bitch by Gunna.txt',
 'Poison by Jack Harlow.txt',
 '7500 OBO by Tim McGraw.txt',
 'Banking On Me by Gunna.txt',
 'Rich Spirit by Kendrick Lamar

In [104]:
df_songs = pd.DataFrame(columns = ['song_name','artist'])
song_lyrics = []

for song in songlist:    
    # getting title and artist from the file name
    title_artist = pd.DataFrame(song[:-4].split('by',1))
    title_artist = title_artist.transpose()
    title_artist.columns = ['song_name','artist']
        
    # clean lyrics
    source = open(os.path.join(path_songs, song), 'r', encoding='cp1252')
    lyrics = source.read()
    clean_tokens = clean_lyrics(lyrics)
    lyrics_string = ' '.join(clean_tokens)
    song_lyrics.append(lyrics_string)
    
    # add to the dataframe
    df_songs = pd.concat([df_songs, title_artist])

df_songs['filename'] = songlist
df_songs['lyrics'] = song_lyrics

df_songs.head()

Unnamed: 0,song_name,artist,filename,lyrics
0,7 Days,YoungBoy Never Broke Again,7 Days by YoungBoy Never Broke Again.txt,window hand m.i.a use lord money love car back...
0,Gravity,Brent Faiyaz,Gravity by Brent Faiyaz.txt,use globe-trottin status afar back star nestin...
0,No Switch,YoungBoy Never Broke Again,No Switch by YoungBoy Never Broke Again.txt,window shawty money split put bad track haha c...
0,City Of Gods,Fivio Foreign Kanye West,City Of Gods by Fivio Foreign Kanye West.txt,option money soft never goon hype love drip ch...
0,High,The Chainsmokers,High by The Chainsmokers.txt,never somebody love good lead drunk instead pl...


In [48]:
def get_most_common(df):
    #df = pd.read_csv(f"songs_cleaned/songs_{year}.csv", sep=";")
    words = []
    for i, lyrics in enumerate(df["lyrics"]):
        for word in lyrics.split(" "):
            # print(word)
            words.append(word)
    words, counts = np.unique(words, return_counts=True)
    words = [x for _, x in sorted(zip(counts, words))][::-1]
    counts = sorted(counts)[::-1]
    # for i in range(len(words)):
    counter = 0
    for i in range(len(words)):
        if not "'" in words[i]:
            print(f"{words[i]:20} {counts[i]}")
            counter += 1

In [82]:
get_most_common(df_songs)

time                 357
love                 339
want                 330
back                 318
never                297
need                 282
keep                 279
shit                 277
fuck                 272
baby                 268
way                  267
give                 261
bitch                257
feel                 254
nigga                250
think                245
good                 236
still                230
right                225
could                222
put                  220
life                 216
look                 207
leave                207
call                 199
every                197
girl                 193
even                 193
night                192
ta                   183
thing                181
try                  180
new                  180
day                  176
would                174
tryna                174
man                  174
run                  166
hit                  164
likeembed            161


bomb                 7
bob                  7
board                7
blitz                7
beauty               7
bajo                 7
backseat             7
ash                  7
anytime              7
anger                7
anda                 7
amiri                7
algo                 7
youngins             6
ying                 6
yeahyou              6
wound                6
wock                 6
within               6
welcome              6
wearin               6
vvs                  6
vine                 6
video                6
vice                 6
umbrella             6
u                    6
turbo                6
truly                6
touchoftrent         6
totin                6
topic                6
todos                6
tipsy                6
tiny                 6
timeembed            6
tim                  6
tiffany              6
thumb                6
thou                 6
thotty               6
thirteen             6
therapy              6
thanks     

obama                3
nver                 3
nowyou               3
nowadays             3
novio                3
november             3
notch                3
nosey                3
nosebleed            3
noose                3
noodle               3
nod                  3
noches               3
nineteen             3
niece                3
nickname             3
newborn              3
nevr                 3
negotiate            3
necesita             3
navidad              3
nauseous             3
nation               3
nardo                3
napkin               3
nan                  3
na-na-na-na          3
na-na-na             3
na-na                3
musty                3
musta                3
muse                 3
muscle               3
murderer             3
multiple             3
muchas               3
mouse                3
motto                3
moreembed            3
morality             3
morale               3
monkey               3
moja                 3
mockery    

discover             2
disbelief            2
directly             2
ding-dong-ding       2
dimelo               2
digress              2
digas                2
differently          2
diet                 2
diego                2
diddy                2
diary                2
diaper               2
dexter               2
deville              2
device               2
deutsche             2
detroit              2
detention            2
destino              2
destine              2
dessert              2
despise              2
desk                 2
depression           2
depart               2
denial               2
demo                 2
delta                2
delete               2
deleon               2
dejando              2
degrade              2
definition           2
definite             2
deere                2
deed                 2
dedication           2
dedicate             2
debo                 2
debajo               2
dearly               2
dean                 2
deadass    

thalia_jamess        1
thalia               1
thaiturkceshut       1
thaisvenskaespanolhrvatskiportuguesitalianomagyardeutschfrancaispolskitagalog 1
thailand             1
thaiitalianofrancaisturkcedeutschnederlandssnow 1
thaiitalianoanti-hero 1
thaii                1
thaiespanolportuguesitalianofrancaisenglishhrvatskiromanizationpink 1
thaied               1
th-these             1
tge.shade.room       1
texts                1
texting              1
texted               1
texaco               1
tether               1
tete                 1
teta                 1
testyou              1
testy                1
testify              1
terzian              1
terrorize            1
terrorist            1
terrified            1
terrific.sims        1
terrier              1
terrible             1
terrace              1
terminamos           1
term                 1
teresa               1
tenth                1
tent                 1
tensa                1
tengas               1
tenfold          

perico               1
perhaps              1
performin            1
perforaremos         1
perfora              1
perdon               1
perdio               1
perdimo              1
perdida              1
perderlo             1
perdemo              1
perdedor             1
perception           1
perceive             1
per                  1
pequena              1
pepsi                1
peppa                1
peor                 1
pent                 1
penon                1
penniless            1
penn                 1
penmanship           1
peng                 1
penetratin           1
penetrate            1
pendosallu           1
pendiente            1
pendejo              1
pencil               1
penant               1
pena                 1
pelo                 1
pelirrojo            1
pelion               1
peligroso            1
pelicula             1
pelan                1
pego                 1
pegate               1
pegas                1
peephole             1
peelin     

huge                 1
huff                 1
huerfanito           1
huella               1
huelerlo             1
huddle               1
huckster             1
huckleberry          1
hubo                 1
hublot               1
hubby                1
hsfe.ue              1
hrself               1
hre                  1
hoyo                 1
howl                 1
housing              1
housin               1
housewife            1
housekeeper          1
houndstooth          1
hotty                1
hottie               1
hots                 1
hostiga              1
hospitality          1
horror.sims          1
horrid               1
horrible             1
horoscopo            1
hornearme            1
horas                1
hopelessness         1
hopeful              1
hope.simss           1
hooray               1
hooper               1
hookin               1
hooker               1
hooked               1
hoodies              1
hoochie-coochie      1
hong                 1
honeyxx._.a

distributin          1
distrae              1
distraction          1
distante             1
dissing              1
disruptors           1
disregardin          1
disregard            1
disputin             1
dispuesto            1
disposition          1
disposable           1
disparamos           1
disown               1
disobey              1
disney               1
dismissive           1
disimular            1
dishonest            1
disgusting           1
disfrutando          1
disfruta             1
disfrazaste          1
disease              1
discuss              1
discredit            1
discoverin           1
discouraged          1
discourage           1
discotheque          1
discoteca            1
disconnected         1
disapprove           1
disappointment       1
disagree             1
disability           1
dirtembed            1
diron                1
diria                1
director             1
directo              1
directamente         1
direccion            1
dire       

basicsyou            1
basicsimmer          1
basements            1
basel                1
barti                1
bartend              1
bart                 1
barstool             1
barrosruandriadearaujo 1
barro                1
barrio               1
barriga              1
barre                1
barneys              1
barn                 1
barkin               1
barkeep              1
bark                 1
barge                1
barefoot             1
barea                1
barcode              1
barbwire             1
barbu                1
barbies              1
barbied              1
barber.sims          1
barbeque             1
baptism              1
bape                 1
baowembed            1
banziwedlin          1
banton               1
bang-out             1
bang-bang            1
bandwagon            1
bandos               1
bandera              1
bandana              1
band-aid             1
banbwoi              1
bananza              1
bamboo               1
bam-bam-b

In [73]:
import itertools
import networkx as nx

In [105]:
network = {}
network_key = 0
for index, row in df_songs.iterrows():
    combined_list = [word for word in str.split(row["lyrics"], " ")]
    #itertool product creates Cartesian product of each element in the combined list
    for pair in itertools.product(combined_list, combined_list):
        #exclude self-loops and count each pair only once because our graph is undirected and we do not take self-loops into account
        if pair[0]!=pair[1] and not(pair[::-1] in network):
            network.setdefault(pair,0)
            network[pair] += 1 
    
network_df = pd.DataFrame.from_dict(network, orient="index")

In [106]:
network_df.reset_index(inplace=True)
network_df.columns = ["pair","weight"]
network_df.sort_values(by="weight",inplace=True, ascending=False)
network_df

Unnamed: 0,pair,weight
4212,"(shit, bitch)",207
2808,"(nigga, bitch)",204
2795,"(nigga, shit)",204
4259,"(shit, fuck)",193
836,"(love, time)",188
...,...,...
1639926,"(touchin, lake)",1
1639927,"(touchin, dust)",1
1639933,"(touchin, drac)",1
1639934,"(touchin, save)",1


In [108]:
#to get weighted graph we need a list of 3-element tuplels (u,v,w) where u and v are nodes and w is a number representing weight
up_weighted = []
for edge in network:
    #we can filter edges by weight by uncommenting the next line and setting desired weight threshold
    #if(network[edge])>1:
    up_weighted.append((edge[0],edge[1],network[edge]))

G = nx.Graph()
G.add_weighted_edges_from(up_weighted)

In [109]:
print(len(G.nodes()))
print(len(G.edges()))

15060
3971011


In [111]:
filename = "/edgelist.csv"
nx.write_weighted_edgelist(G, filename, delimiter=",")
#add header with appropriate column names (works on collab and Linux/Mac(?))
!sed -i.bak 1i"Source,Target,Weight" ./edgelist.csv

OSError: [Errno 30] Read-only file system: '/edgelist.csv'