# Read the data

In [1]:
import pandas as pd

from data import DataLoader

songs = DataLoader.load()
df = pd.DataFrame(songs)
df.head()

Unnamed: 0,artist,lyrics,name
0,Trent Tomlinson,It was perfect timin'\nBoth of us findin' what...,A Good Run by Trent Tomlinson
1,Trent Tomlinson,"Twenty-five just yesterday\nPartied too hard, ...",A Man Without A Woman by Trent Tomlinson
2,Trent Tomlinson,The sunbeam hit the Jim Beam\nAnd ricocheted o...,Angels Like Her by Trent Tomlinson
3,Trent Tomlinson,I still show up at five\nLet everybody know I'...,Cheatin On My Honky-Tonk by Trent Tomlinson
4,Trent Tomlinson,Baby I don't care that your hair is soakin' we...,Come back to bed by Trent Tomlinson


# Inspect the data

In [139]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  204 non-null    object
 1   lyrics  204 non-null    object
 2   name    204 non-null    object
dtypes: object(3)
memory usage: 4.9+ KB


# Clean the data

In [140]:
import re
import string
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

def clean(text):
    lowered_no_punc = text.lower()\
        .translate(str.maketrans('', '', string.punctuation))
    no_number_string = re.sub(r'\d+','',lowered_no_punc)
    no_stopwords = ' '.join([w for w in no_number_string.split() if w not in stopwords])
    return no_stopwords.replace('’','')

df['lyrics_cleaned'] = df['lyrics'].apply(clean)
df['words'] = df['lyrics_cleaned'].apply(lambda x: x.split())
df.head()

Unnamed: 0,artist,lyrics,name,lyrics_cleaned,words
0,Trent Tomlinson,It was perfect timin'\nBoth of us findin' what...,A Good Run by Trent Tomlinson,perfect timin us findin needed blind love driv...,"[perfect, timin, us, findin, needed, blind, lo..."
1,Trent Tomlinson,"Twenty-five just yesterday\nPartied too hard, ...",A Man Without A Woman by Trent Tomlinson,twentyfive yesterday partied hard slept late p...,"[twentyfive, yesterday, partied, hard, slept, ..."
2,Trent Tomlinson,The sunbeam hit the Jim Beam\nAnd ricocheted o...,Angels Like Her by Trent Tomlinson,sunbeam hit jim beam ricocheted bottle coke cu...,"[sunbeam, hit, jim, beam, ricocheted, bottle, ..."
3,Trent Tomlinson,I still show up at five\nLet everybody know I'...,Cheatin On My Honky-Tonk by Trent Tomlinson,still show five let everybody know ive arrived...,"[still, show, five, let, everybody, know, ive,..."
4,Trent Tomlinson,Baby I don't care that your hair is soakin' we...,Come back to bed by Trent Tomlinson,baby dont care hair soakin wet drop towel floo...,"[baby, dont, care, hair, soakin, wet, drop, to..."


# Calculate Tf-Idf

In [141]:
from tf_idf import TfIdf

def apply_tfidf(df):
    return TfIdf.compute_tfidf(df['words'].values)

tfidf = df[['words', 'artist']]\
    .groupby('artist').apply(apply_tfidf)
tfidf.head()

artist
Bad Meets Evil    [{'verse': 0.0006426393435547508, 'eminem': 0....
D12               [{'intro': 0.0008600857018970892, 'bizarre': 0...
Daya              [{'verse': 0.00045987427699171, 'rules': 0.005...
Eminem            [{'hit': 0.0056848572667433945, 'em': 0.005684...
Evanescence       [{'shower': 0.010869565217391304, 'dark': 0.01...
dtype: object

# Analyze Tf-Idf

In [142]:
top10 = {} # top10 words for each song for each artist
def sort_by_value(x):
    return dict(sorted(x.items(), key=lambda item: item[1], reverse=True))
for artist in tfidf.index:
    top10[artist] = []
    for song_tfidf in tfidf[artist]:
        top_words = list(sort_by_value(song_tfidf).items())[:5]
        def hide_illegal(w):
            return w.replace('fuck', '@*#!')
        top_words = [hide_illegal(w) for w,_ in top_words]
        top10[artist].append(top_words)
top10

{'Bad Meets Evil': [['yo', 'lesson', 'yall', 'youll', 'breathing'],
  ['poor', 'claret', 'jai', 'rich', 'richer'],
  ['kiss', 'want', 'sucking', 'female', 'nina'],
  ['‘cause', 'moment', 'wake', 'day', 'opponents'],
  ['unavailableembedshare', 'urlcopyembedcopy'],
  ['meets', 'emcees', 'evil', 'scare', 'seuss'],
  ['echo', 'callin', 'follow', 'liz', 'rodrigues'],
  ['pedal', 'livin', 'fast', 'lane', 'metal'],
  ['everything', 'painkillers', 'syrup', 'hennessy', 'cigarette'],
  ['holdin', 'vest', 'second', 'nigga', 'wack']],
 'D12': [['hundred', 'keep', 'kidd', 'riding', 'start'],
  ['waiting', 'artists', 'lyricsembedshare', 'urlcopyembedcopy'],
  ['‘em', 'mase', 'bank', 'moe', 'young'],
  ['pour', 'guzzle', 'club', 'wild', 'mother@*#!er'],
  ['mother@*#!ers', 'sing', 'along', 'words', 'morning'],
  ['dozen', 'rolling', 'dirty', 'leave', 'kids'],
  ['aiyyo', 'anybody', 'yo', '@*#!in', 'play'],
  ['journey', 'psychopath', 'spiller', 'blood', 'mentality'],
  ['@*#!ing', 'psycho', 'breal',

In [143]:
top10_per_artist = {}
for artist in top10:
    cnt = {}
    for song in top10[artist]:
        for word in song:
            cnt[word] = cnt.get(word, 1) + 1
    cnt = sort_by_value(cnt)
    cnt = list(cnt.items())[:10]
    words = [w for w,_  in cnt]
    top10_per_artist[artist] = words

In [145]:
intersection = {}
for artist1 in top10_per_artist:
    intersection[artist1] = {}
    for artist2 in top10_per_artist:
        if artist2 == artist1:
            continue
        x = set(top10_per_artist[artist1])\
            .intersection(set(top10_per_artist[artist2]))
        if len(x) > 0:
            intersection[artist1][artist2] = x
    if len(intersection[artist1].keys()) == 0:
        intersection.pop(artist1, None)
intersection

{'Daya': {'Simon & Garfunkel': {'need'}},
 'Eminem': {'Mr. President': {'ya'}},
 'Evanescence': {'Xzibit': {'new', 'wanted'}},
 'Mr. President': {'Eminem': {'ya'}, 'The Rasmus': {'gonna'}},
 'No Doubt': {'Robbie Williams': {'love'}, 'The Subways': {'love'}},
 'Palastic': {'Simon & Garfunkel': {'burning'}},
 'Robbie Williams': {'No Doubt': {'love'},
  'The Rasmus': {'oh'},
  'The Subways': {'love'}},
 'Simon & Garfunkel': {'Daya': {'need'}, 'Palastic': {'burning'}},
 'Sting': {'The Rasmus': {'every'}},
 'The Rasmus': {'Mr. President': {'gonna'},
  'Robbie Williams': {'oh'},
  'Sting': {'every'},
  'The Subways': {'days'}},
 'The Subways': {'No Doubt': {'love'},
  'Robbie Williams': {'love'},
  'The Rasmus': {'days'}},
 'Xzibit': {'Evanescence': {'new', 'wanted'}}}