### Setup

In [None]:
# Mount Google Drive at /content/drive; every data path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import zipfile
import requests
from io import StringIO

import pandas as pd
import numpy as np
from scipy.sparse import dok_matrix, save_npz, load_npz

from sklearn.feature_extraction.text import TfidfTransformer

### Setup 2

The cells below recreate the directory structure used throughout this notebook. Because most of the data files are large, only the zip archives are stored in the repository; the code below extracts their contents.

In [None]:
# Working directories used by the rest of the notebook.
dirs = [
    "/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/original",
    "/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/generated",
    "/content/drive/MyDrive/cmpe256-project/million_songs_data/GloVe_embeddings"
]

for dir_path in dirs:
    already_present = os.path.isdir(dir_path)
    # exist_ok=True makes the cell safe to re-run and removes the
    # check-then-create gap of the isdir()/makedirs() pair.
    os.makedirs(dir_path, exist_ok=True)
    if already_present:
        print("Directory already exists.")

In [None]:
# Extract every musiXmatch archive from the zipped/ folder into original/.
PATH = "/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/zipped/"
EXTRACT_DIR = "/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/original"

# Keep only .zip entries: os.listdir also returns sub-folders and stray files,
# which would make ZipFile raise BadZipFile. Sorted for a deterministic order.
zip_files = sorted(name for name in os.listdir(PATH) if name.lower().endswith(".zip"))

for file in zip_files:
    with zipfile.ZipFile(os.path.join(PATH, file), 'r') as zip_ref:
        zip_ref.extractall(EXTRACT_DIR)

In [None]:
## File too big to load into memory
# r = requests.get("https://nlp.stanford.edu/data/glove.42B.300d.zip", stream=True)
# with open("/content/drive/MyDrive/cmpe256-project/million_songs_data/glove.42B.300d.zip", "wb") as fp:
#     for chunk in r.iter_content(chunk_size=512):
#         fp.write(chunk)

In [None]:
# Unpack the GloVe 6B archive into the GloVe_embeddings directory.
glove_archive_path = "/content/drive/MyDrive/cmpe256-project/million_songs_data/glove.6B.zip"
glove_target_dir = "/content/drive/MyDrive/cmpe256-project/million_songs_data/GloVe_embeddings"

with zipfile.ZipFile(glove_archive_path, 'r') as glove_archive:
    glove_archive.extractall(glove_target_dir)

In [None]:
# with zipfile.ZipFile("/content/drive/MyDrive/cmpe256-project/million_songs_data/glove.42B.300d.zip", 'r') as zip_ref:
#     zip_ref.extractall("/content/drive/MyDrive/cmpe256-project/million_songs_data/GloVe_embeddings")

## Musix Match Dataset

#### Preprocessing #1

In [None]:
## Format of the lyrics file from Musix Match
# Lines starting with '#' are comments and are ignored.
# %word1,word2,... - list of top words, in popularity order
# TID,MXMID,idx:cnt,idx:cnt,... - track ID from MSD, track ID from musiXmatch,
# then word index : word count (word index starts at 1!)

In [None]:
# Read the whole musiXmatch training file into memory, one string per line
# (comment lines, the '%' vocabulary line, then one line per song).
# The encoding is explicit so decoding does not depend on the machine's locale;
# the vocabulary contains non-ASCII words.
with open('/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/original/mxm_dataset_train.txt', 'r', encoding='utf-8') as fp:
    raw_lyrics = fp.readlines()

In [None]:
len(raw_lyrics)

210537

In [None]:
def preprocess(raw_lyrics, header_idx=17):
    """Extract the vocabulary and the per-song id table from the raw lyrics file.

    Parameters
    ----------
    raw_lyrics : list of str
        Lines of the musiXmatch file. Line ``header_idx`` is the
        ``%word1,word2,...`` vocabulary line; every following line is a song
        of the form ``TID,MXMID,idx:cnt,idx:cnt,...``.
    header_idx : int, default 17
        Index of the vocabulary line. Song lines start at ``header_idx + 1``.

    Returns
    -------
    top_words : list of str
        Vocabulary in file order, without the leading '%' or trailing newline.
    song_ids : pandas.DataFrame
        One row per song line, with columns ``MSD_track_id`` and
        ``Musix_match_track_id`` (both kept as strings).

    Raises
    ------
    IndexError
        If a song line has fewer than two comma-separated fields.

    Notes
    -----
    Word counts are not parsed here. A dense per-song count matrix was tried
    earlier but takes too much storage and RAM; ``convertToSparse`` builds the
    counts in sparse form instead.
    """
    # Drop the leading '%' and the line terminator so the last word is clean.
    top_words = raw_lyrics[header_idx][1:].rstrip("\r\n").split(',')

    # One dict per song, in file order, so row i matches song line i.
    ids = []
    for line in raw_lyrics[header_idx + 1:]:
        # Strip the terminator first: a line with ids only would otherwise
        # keep the newline on the musiXmatch id.
        song = line.rstrip("\r\n").split(',')
        ids.append({
            'MSD_track_id': song[0],
            'Musix_match_track_id': song[1]
        })

    # Data structure holding both id's for each song
    song_ids = pd.DataFrame(ids, columns=['MSD_track_id', 'Musix_match_track_id'])

    return top_words, song_ids

In [None]:
# Parse the raw lines into the vocabulary list and the per-song id table.
# (Earlier variant that also returned a dense matrix:
#  top_words, lyrics, song_ids = preprocess(raw_lyrics))
top_words, song_ids = preprocess(raw_lyrics)

In [None]:
# len(lyrics)

In [None]:
len(song_ids)

210519

In [None]:
song_ids.head()

Unnamed: 0,MSD_track_id,Musix_match_track_id
0,TRAAAAV128F421A322,4623710
1,TRAAABD128F429CF47,6477168
2,TRAAAED128E0783FAB,2516445
3,TRAAAEF128F4273421,3759847
4,TRAAAEW128F42930C0,3783760


In [None]:
# Save the id table under musix_match_data/generated/: that directory is the
# one created in "Setup 2" and the one this file is read back from later on.
song_ids.to_csv('/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/generated/song_ids.csv')

In [None]:
# with open('/content/drive/MyDrive/million_songs_data/lyric_term_frequencies.txt', 'w') as fp:
#     for song in lyrics:
#         song = [str(x) for x in song]
#         fp.write(', '.join(song) + "\n")

In [None]:
# Save the vocabulary line under musix_match_data/generated/ (the directory
# created in "Setup 2" and the location the vocabulary is reloaded from later).
with open('/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/generated/top_words.txt', 'w') as fp:
    # raw_lyrics[17] is "%word1,word2,...\n": drop the '%' and the existing
    # terminator so exactly one newline is written. A separate name is used
    # so the parsed `top_words` list is not replaced by a raw string.
    top_words_line = raw_lyrics[17][1:].rstrip("\r\n")
    fp.write(top_words_line + "\n")

#### Preprocessing For Word Indexes and Counts

In [None]:
# Converting matrix of term counts into a sparse matrix format
def convertToSparse(lyrics, num_words=5000):
    """Build a sparse song-by-word count matrix from raw song lines.

    Parameters
    ----------
    lyrics : iterable of str
        Song lines of the form ``TID,MXMID,idx:cnt,idx:cnt,...`` where ``idx``
        is a 1-based word index.
    num_words : int, default 5000
        Number of columns (vocabulary size).

    Returns
    -------
    scipy.sparse.dok_matrix
        Integer matrix of shape ``(number of song lines, num_words)``; entry
        ``[i, j]`` is the count of word ``j + 1`` in song line ``i``.
    """
    # Materialise once so the row count can be taken from the input itself.
    # The matrix then has exactly one row per song, with no phantom all-zero
    # rows that would distort document-frequency statistics downstream.
    lyrics = list(lyrics)
    dok_mat = dok_matrix((len(lyrics), num_words), dtype=np.int32)

    for song_num, song in enumerate(lyrics):
        # Fields 0 and 1 are the two track ids; the rest are idx:cnt pairs.
        for entry in song.split(",")[2:]:
            cnts = entry.split(":")

            idx = int(cnts[0]) - 1  # file indices are 1-based
            num = int(cnts[1])      # int() tolerates the trailing newline

            dok_mat[song_num, idx] = num

    return dok_mat

In [None]:
# Skip the first 18 lines: preprocess() treats line 17 as the '%' vocabulary
# line and everything from line 18 on as song lines.
lyrics_sparse = convertToSparse(raw_lyrics[18:])

In [None]:
# Convert the DOK matrix to CSR; the CSR form is what gets passed to the TF-IDF step.
csr_lyrics_sparse = lyrics_sparse.tocsr()

In [None]:
# Re-weight the raw term counts with TF-IDF using scikit-learn's default settings.
tfidfTransformer = TfidfTransformer()
tfidf_csr_lyrics_sparse = tfidfTransformer.fit_transform(csr_lyrics_sparse)

In [None]:
# tfidf_csr_lyrics_sparse.todense()

In [None]:
# Saving sparse matrix to a file.
# Written under musix_match_data/generated/: that directory is created in
# "Setup 2" and is where the matrix is loaded from in the embedding-matching section.
save_npz('/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/generated/lyrics_csr_tfidf.npz', tfidf_csr_lyrics_sparse)

In [None]:
## Checking that the sparse matrix can be loaded correctly
# s = load_npz('/content/drive/MyDrive/cmpe256-project/million_songs_data/generated/lyrics_csr_tfidf.npz')
# s

#### Dealing with Dense Matrices (Bad approach)

In [None]:
# raw_lyrics_tf = []
# with open('/content/drive/MyDrive/cmpe256-project/million_songs_data/generated/lyric_term_frequencies.txt', 'r') as fp:
#     raw_lyrics_tf = fp.readlines()

In [None]:
## Takes up too much RAM trying to load dense matrix
# lyrics_tf = []
# for line in raw_lyrics_tf:
#     line = line.split(',')
#     line = [int(x) for x in line]
#     lyrics_tf.append(line)

#### Stemming Mappings

In [None]:
# Creating dictionary of words stemmed by Porter Stemmer to map to their unstemmed forms.
# Each line of the file is "<first field><SEP><second field>"; the first field
# becomes the key and the second the value.
stem_mappings = dict()
with open('/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/original/stem_mappings.txt', 'r', encoding='utf-8') as fp:
    raw_mappings = fp.readlines()

# Parse after the file is closed; nothing below needs the handle.
for line in raw_mappings:
    tup = line.strip().split("<SEP>")

    # Skip blank or malformed lines instead of failing with IndexError.
    if len(tup) < 2:
        continue

    stem_mappings[tup[0]] = tup[1]

In [None]:
# stem_mappings

#### GloVe embeddings

In [None]:
# Read the pre-trained GloVe file into a list of whitespace-stripped lines
# (one "token v1 v2 ..." string per entry).
embeddings_filename = '/content/drive/MyDrive/cmpe256-project/million_songs_data/GloVe_embeddings/glove.6B.50d.txt'
# embeddings_filename = '/content/drive/MyDrive/cmpe256-project/million_songs_data/GloVe_embeddings/glove.42B.300d.txt'
with open(embeddings_filename, 'r') as glove_file:
    glove_embeddings = [raw_line.strip() for raw_line in glove_file]

In [None]:
len(glove_embeddings)

400000

In [None]:
# Build the token -> vector lookup: the first space-separated field of each
# line is the token, the remaining fields are the float64 vector components.
embeddings = dict()
for glove_line in glove_embeddings:
    token, *components = glove_line.split(' ')
    embeddings[token] = np.array([float(component) for component in components], dtype=np.float64)

In [None]:
# embeddings

#### Matching Embeddings to Songs

In [None]:
# Load the saved TF-IDF matrix from musix_match_data/generated/ and display its summary.
csr_lyrics = load_npz('/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/generated/lyrics_csr_tfidf.npz')
csr_lyrics

<210536x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 16845943 stored elements in Compressed Sparse Row format>

In [None]:
# COO form exposes parallel .data/.row/.col arrays, which the loops below zip over.
coo_lyrics = csr_lyrics.tocoo()
coo_lyrics

<210536x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 16845943 stored elements in COOrdinate format>

In [None]:
# Save the COO matrix so it can be reloaded without redoing the conversion.
save_npz('/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/generated/lyrics_coo.npz', coo_lyrics)

In [None]:
# coo_lyrics = load_npz('/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/generated/lyrics_coo.npz')

In [None]:
# Reload the vocabulary: the first line of top_words.txt holds the
# comma-separated words; each one is stripped of surrounding whitespace.
with open('/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/generated/top_words.txt', 'r') as top_words_file:
    vocabulary_line = top_words_file.readlines()[0]

top_words = [token.strip() for token in vocabulary_line.split(',')]

In [None]:
# Creating a set of words that are not found in GloVe embeddings.
# Each top word is mapped through stem_mappings; the mapped form is what gets
# looked up in GloVe.
import re

excluded_words = set()

for word in top_words:
    original = stem_mappings[word]

    # For words such as jumpin', I manually add back in the 'g'.
    # endswith() is also safe when the mapped form is an empty string,
    # where original[-1] would raise IndexError.
    if original.endswith('\''):
        original = original[:-1] + "g"

    # Membership test directly on the dict (no .keys() view needed).
    if original not in embeddings:
        excluded_words.add(word)

print("Words not found in GloVe: ", len(excluded_words))
print(excluded_words)

Words not found in GloVe:  206
{'lalala', 'qued', 'cétait', 'vuelva', 'nen', 'aqu', 'croir', 'i´m', 'kuinka', 'står', 'aún', 'tão', 'olhar', 'tuyo', 'ahí', 'dimmi', 'puoi', 'sinä', 'ninguém', 'wär', '3x', 'you’r', 'estou', 'vaikka', 'sagt', 'ingent', 'prend', 'qu', 'kaiken', 'também', 'it´', 'gefühl', 'querert', 'diga', 'vielä', 'kannst', 'x4', 'deinen', 'då', 'it’', 'motherfuckin', 'siehst', 'ficar', 'sehen', 'jen', 'deje', 'blir', 'tenert', 'javai', 'sonho', 'pleur', 'ny', 'rocknrol', 'keinen', 'bleib', 'estc', 'kaikki', 'pierdo', 'acabar', 'niemal', 'youâ\x80\x99r', 'niin', 'fore', 'jaurai', 'joue', 'kvar', 'där', 'gehen', 'cos', 'perd', 'cansado', 'jsui', 'motivo', 'doigt', 'bout', 'lon', 'hogi', 'mnie', 'lhe', 'coisa', 'lil', '4x', 'decirt', 'c\x9cur', 'outra', 'sest', 'allí', 'sinto', 'coraz', 'i’m', 'einfach', 'cè', '‘caus', 'tellement', 'willst', '¿qué', 'piu', 'già', 'perch', 'estribillo', 'x2', 'nessuno', 'genau', 'iâ\x80\x99m', 'weißt', 'minä', 'difícil', 'vielleicht', 'you´

In [None]:
# Showing a subset of percentages of missing words (words from top words that don't show up in GloVe embeddings).
# One line is printed per previewed song that has at least one stored entry.
NUM_PREVIEW_SONGS = 10

counter = 0    # entries of the current song whose word is in excluded_words
counter2 = 0   # all entries of the current song
prev = None    # row currently being accumulated; None until the first entry

# Assumes entries are grouped by ascending row, as produced by csr.tocoo().
for r, c in zip(coo_lyrics.row, coo_lyrics.col):

    # ">=" rather than "==" so the loop still stops if a row has no entries.
    if r >= NUM_PREVIEW_SONGS:
        break

    if prev is not None and r != prev:
        # Row changed: report the finished song, then start a fresh tally.
        print(round(100 * counter / counter2, 0), " % words missed")
        counter = 0
        counter2 = 0
    prev = r

    counter2 += 1
    if top_words[c] in excluded_words:
        counter += 1

# Report the last previewed song too; the loop only prints on a row change.
if prev is not None:
    print(round(100 * counter / counter2, 0), " % words missed")
 
    # print('*'*10)

0.0  % words missed
1.0  % words missed
2.0  % words missed
3.0  % words missed
3.0  % words missed
1.0  % words missed
5.0  % words missed
1.0  % words missed
1.0  % words missed


In [None]:
# Reload the song id table; index_col=0 uses the first CSV column (the saved
# row index) as the DataFrame index instead of a data column.
ids_df = pd.read_csv('/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/generated/song_ids.csv', index_col=0)
ids_df

Unnamed: 0,MSD_track_id,Musix_match_track_id
0,TRAAAAV128F421A322,4623710
1,TRAAABD128F429CF47,6477168
2,TRAAAED128E0783FAB,2516445
3,TRAAAEF128F4273421,3759847
4,TRAAAEW128F42930C0,3783760
...,...,...
210514,TRZZZWS128F429CF87,3080645
210515,TRZZZXA128F428ED56,2344272
210516,TRZZZXV128F4289747,1417347
210517,TRZZZYV128F92E996D,6849828


In [None]:
# Print the row position of one specific MSD track id (first match only).
target_track_id = "TRMAXFX128F1462371"

for position, track_id in enumerate(ids_df["MSD_track_id"].values):
    if track_id != target_track_id:
        continue
    print(position)
    break

97971


In [None]:
# Header for the embeddings CSV: the id columns followed by d0..d49,
# one column per embedding dimension.
emb_columns = [*ids_df.columns, *("d" + str(dim) for dim in range(50))]

print(emb_columns)

['MSD_track_id', 'Musix_match_track_id', 'd0', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9', 'd10', 'd11', 'd12', 'd13', 'd14', 'd15', 'd16', 'd17', 'd18', 'd19', 'd20', 'd21', 'd22', 'd23', 'd24', 'd25', 'd26', 'd27', 'd28', 'd29', 'd30', 'd31', 'd32', 'd33', 'd34', 'd35', 'd36', 'd37', 'd38', 'd39', 'd40', 'd41', 'd42', 'd43', 'd44', 'd45', 'd46', 'd47', 'd48', 'd49']


In [None]:
# Creating final song embeddings by averaging all embeddings for a song and weighting each word embedding with their TF-IDF values.
# One CSV row is written per song that has at least one stored entry in
# coo_lyrics; a song whose words are all excluded gets an all-zero vector.
import csv

# Number of embedding columns = header columns that are not id columns.
embedding_dim = len(emb_columns) - len(ids_df.columns)


def _write_song_embedding(writer, song_row, weighted_vectors):
    """Write one CSV row: the song's ids followed by the mean of its weighted vectors."""
    ids = ids_df.iloc[song_row].values

    if len(weighted_vectors) > 0:
        avg = np.mean(weighted_vectors, axis=0)
    else:
        avg = np.zeros((embedding_dim,), dtype=np.float64)

    writer.writerow(dict(zip(emb_columns, np.append(ids, avg))))


# `with` guarantees the file is closed even if the loop raises;
# newline="" is the csv-module convention for files handed to a writer.
with open("/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/generated/song_embeddings.csv", "w", newline="") as fp:

    writer = csv.DictWriter(fp, fieldnames=emb_columns)
    writer.writeheader()

    tmp = []      # TF-IDF-weighted word vectors of the song being accumulated
    prev = None   # row index of that song; None until the first entry is seen

    # Assumes entries are grouped by ascending row, as produced by csr.tocoo().
    for d, r, c in zip(coo_lyrics.data, coo_lyrics.row, coo_lyrics.col):

        if prev is not None and r != prev:
            # Row changed: flush the finished song before starting the next.
            _write_song_embedding(writer, prev, tmp)
            tmp = []
        prev = r

        word = top_words[c]
        if word in excluded_words:
            continue

        original = stem_mappings[word]

        # For words such as jumpin', add back the 'g'.
        if original.endswith('\''):
            original = original[:-1] + "g"

        tmp.append(d * embeddings[original])

    # Flush the last song: the loop only writes when the row index changes,
    # so without this the final song would be missing from the file.
    if prev is not None:
        _write_song_embedding(writer, prev, tmp)