In [None]:
# Mount Google Drive at /content/drive; every path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [27]:
import os
import pandas as pd
import numpy as np
from scipy.sparse import dok_matrix, save_npz, load_npz

from sklearn.feature_extraction.text import TfidfTransformer

In [37]:
# Directories the notebook reads from or writes to. They are created up front so a
# fresh "Restart & Run All" does not fail on a missing folder.
dirs = [
    "/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/original",
    "/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/generated",
    "/content/drive/MyDrive/cmpe256-project/million_songs_data/GloVe_embeddings",
    # Output folder used by the cells that save song_ids.csv, top_words.txt and
    # lyrics_csr_tfidf.npz; it was previously never created by this notebook.
    "/content/drive/MyDrive/cmpe256-project/million_songs_data/generated",
]

for dir_path in dirs:
    if os.path.isdir(dir_path):
        print("Directory already exists.")
    else:
        # exist_ok avoids an error if the directory appears between the check and the call.
        os.makedirs(dir_path, exist_ok=True)

Directory already exists.
Directory already exists.
Directory already exists.


In [36]:
import zipfile

# Folder holding the zipped musiXmatch files; each archive is unpacked into the
# "original" data directory.
PATH = "/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/zipped/"
zip_files = os.listdir(PATH)

for file in zip_files:
    archive_path = os.path.join(PATH, file)

    # os.listdir returns every entry (sub-directories, hidden files, ...). Opening a
    # non-archive with ZipFile raises, so skip those entries and say so explicitly.
    if not zipfile.is_zipfile(archive_path):
        print("Skipping non-zip entry: " + file)
        continue

    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall("/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/original")

In [50]:
import requests
from io import StringIO

# Destination of the GloVe archive on Drive.
glove_zip_path = "/content/drive/MyDrive/cmpe256-project/million_songs_data/glove.6B.zip"

# Skip the download when the archive is already present so re-running the notebook
# does not fetch it again.
if not os.path.isfile(glove_zip_path):
    # Download to a temporary name first; an interrupted transfer then never leaves a
    # truncated file under the final name.
    partial_path = glove_zip_path + ".part"

    with requests.get("https://nlp.stanford.edu/data/glove.6B.zip", stream=True, timeout=60) as r:
        # Fail loudly on HTTP errors instead of saving an error page as the zip file.
        r.raise_for_status()

        with open(partial_path, "wb") as fp:
            # 1 MiB chunks instead of 128-byte chunks: far fewer write calls.
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                fp.write(chunk)

    os.replace(partial_path, glove_zip_path)

In [51]:
# Unpack the downloaded GloVe archive into the embeddings directory.
glove_archive = "/content/drive/MyDrive/cmpe256-project/million_songs_data/glove.6B.zip"
glove_target_dir = "/content/drive/MyDrive/cmpe256-project/million_songs_data/GloVe_embeddings"

with zipfile.ZipFile(glove_archive, mode='r') as zip_ref:
    zip_ref.extractall(path=glove_target_dir)

## Musix Match Dataset

#### Preprocessing #1

In [None]:
## Format of the lyrics file
# Lines starting with '#' are comments and are ignored.
# %word1,word2,... - list of top words, in popularity order
# TID,MXMID,idx:cnt,idx:cnt,... - track ID from MSD, track ID from musiXmatch,
# then word index : word count pairs (word index starts at 1!)

In [None]:
# Load the training file as a list of lines (each line keeps its trailing newline).
raw_lyrics = []
train_file_path = '/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/original/mxm_dataset_train.txt'
with open(train_file_path, 'r') as lyrics_file:
    raw_lyrics = list(lyrics_file)

In [None]:
# Total number of lines read from the file, header lines included.
len(raw_lyrics)

210537

In [None]:
def preprocess(raw_lyrics):
    """Extract the top-word vocabulary and the per-song ids from the raw lyrics lines.

    Lines are classified by their first character instead of by a fixed line
    number: '#' marks a comment, '%' marks the comma-separated top-word list,
    and every other non-empty line is a song record of the form
    ``TID,MXMID,idx:cnt,idx:cnt,...``. Word counts are not parsed here.

    Parameters
    ----------
    raw_lyrics : iterable of str
        Lines of the lyrics file, with or without trailing newlines.

    Returns
    -------
    top_words : list of str
        The top words in file order, without any trailing newline on the last word.
    song_ids : pandas.DataFrame
        One row per song with the string columns 'MSD_track_id' and
        'Musix_match_track_id', in file order.

    Raises
    ------
    ValueError
        If no '%' top-word line is present, or a song line has fewer than two fields.
    """
    top_words = None
    ids = []

    for line in raw_lyrics:
        # Strip first so the newline never ends up inside the last word or id.
        line = line.strip()

        if not line or line.startswith('#'):
            continue

        if line.startswith('%'):
            top_words = line[1:].split(',')
            continue

        song = line.split(',')
        if len(song) < 2:
            raise ValueError("Malformed song line (expected 'TID,MXMID,...'): %r" % line)

        ids.append({
            'MSD_track_id': song[0],
            'Musix_match_track_id': song[1]
        })

    if top_words is None:
        raise ValueError("No top-word line (starting with '%') found in raw_lyrics.")

    # Data structure holding both ids for each song.
    song_ids = pd.DataFrame(ids, columns=['MSD_track_id', 'Musix_match_track_id'])

    return top_words, song_ids

In [None]:
## Earlier version also returned the dense term-frequency structure:
## top_words, lyrics, song_ids = preprocess(raw_lyrics)
# Current version returns only the top-word list and the song id table.
top_words, song_ids = preprocess(raw_lyrics)

In [None]:
# len(lyrics)

In [None]:
# Number of songs parsed (one row per song line).
len(song_ids)

210519

In [None]:
# Preview the first rows of the song id table.
song_ids.head()

Unnamed: 0,MSD_track_id,Musix_match_track_id
0,TRAAAAV128F421A322,4623710
1,TRAAABD128F429CF47,6477168
2,TRAAAED128E0783FAB,2516445
3,TRAAAEF128F4273421,3759847
4,TRAAAEW128F42930C0,3783760


In [None]:
# Persist the song id table as CSV (the DataFrame index is written as the first column).
song_ids_csv_path = '/content/drive/MyDrive/cmpe256-project/million_songs_data/generated/song_ids.csv'
song_ids.to_csv(song_ids_csv_path)

In [None]:
# with open('/content/drive/MyDrive/million_songs_data/lyric_term_frequencies.txt', 'w') as fp:
#     for song in lyrics:
#         song = [str(x) for x in song]
#         fp.write(', '.join(song) + "\n")

In [None]:
# Save the comma-separated top-word line (without its leading '%') to a text file.
with open('/content/drive/MyDrive/cmpe256-project/million_songs_data/generated/top_words.txt', 'w') as fp:
    # Use a separate name so the `top_words` list returned by preprocess() is not
    # overwritten with a raw string. The source line already ends with a newline,
    # so strip it to write exactly one instead of two.
    top_words_line = raw_lyrics[17][1:].rstrip("\r\n")
    fp.write(top_words_line + "\n")

#### Preprocessing For Word Indexes and Counts

In [None]:
def convertToSparse(lyrics, num_words=5000):
    """Build a sparse song-by-word count matrix from raw song lines.

    Each line has the form ``TID,MXMID,idx:cnt,idx:cnt,...``. The two leading
    ids are ignored; every ``idx:cnt`` pair sets column ``idx - 1`` of that
    song's row to ``cnt``.

    The matrix gets exactly one row per input line. A fixed row count larger
    than the number of songs would leave empty trailing rows, which inflate the
    document count seen by a later TF-IDF fit and break row alignment with the
    song id table.

    Parameters
    ----------
    lyrics : iterable of str
        Song lines only, i.e. without the comment and top-word header lines.
    num_words : int, optional
        Number of columns (vocabulary size). Defaults to 5000.

    Returns
    -------
    scipy.sparse.dok_matrix
        int32 matrix of shape ``(number of lines, num_words)``.

    Raises
    ------
    IndexError
        If a word index falls outside ``1..num_words``.
    """
    # Materialise so generators work too and the row count is known up front.
    lyrics = list(lyrics)
    dok_mat = dok_matrix((len(lyrics), num_words), dtype=np.int32)

    for song_num, song in enumerate(lyrics):
        fields = song.strip().split(",")

        # fields[0] and fields[1] are the two track ids; the rest are idx:cnt pairs.
        for pair in fields[2:]:
            parts = pair.split(":")

            # Word indices in the file are 1-based; matrix columns are 0-based.
            idx = int(parts[0]) - 1
            num = int(parts[1])

            # Reject out-of-range indices explicitly; a word index of 0 would
            # otherwise become -1 and silently wrap to the last column.
            if not 0 <= idx < num_words:
                raise IndexError(
                    "word index %d out of range 1..%d in song line %d"
                    % (idx + 1, num_words, song_num)
                )

            dok_mat[song_num, idx] = num

    return dok_mat

In [None]:
# Skip the first 18 lines (the header, ending with the top-word line at index 17)
# so only song lines are converted.
lyrics_sparse = convertToSparse(raw_lyrics[18:])

In [None]:
# Convert the DOK matrix to CSR format before applying the TF-IDF transform.
csr_lyrics_sparse = lyrics_sparse.tocsr()

In [None]:
# Fit a TF-IDF transformer (default settings) on the raw count matrix and apply it.
tfidfTransformer = TfidfTransformer()
tfidf_csr_lyrics_sparse = tfidfTransformer.fit_transform(csr_lyrics_sparse)

In [None]:
# tfidf_csr_lyrics_sparse.todense()

In [None]:
# Persist the TF-IDF weighted sparse matrix to Drive in .npz format.
tfidf_npz_path = '/content/drive/MyDrive/cmpe256-project/million_songs_data/generated/lyrics_csr_tfidf.npz'
save_npz(tfidf_npz_path, tfidf_csr_lyrics_sparse)

In [None]:
# s = load_npz('/content/drive/MyDrive/cmpe256-project/million_songs_data/generated/lyrics_csr_tfidf.npz')
# s

#### Dealing with Dense Matrices (Bad approach)

In [None]:
# raw_lyrics_tf = []
# with open('/content/drive/MyDrive/cmpe256-project/million_songs_data/generated/lyric_term_frequencies.txt', 'r') as fp:
#     raw_lyrics_tf = fp.readlines()

In [None]:
# lyrics_tf = []
# for line in raw_lyrics_tf:
#     line = line.split(',')
#     line = [int(x) for x in line]
#     lyrics_tf.append(line)

#### Stemming Mappings

In [None]:
stem_mappings = {}

# Stem mapping file (noted as Porter stemming); each line holds fields separated by "<SEP>".
with open('/content/drive/MyDrive/cmpe256-project/million_songs_data/musix_match_data/original/stem_mappings.txt', 'r') as fp:
    raw_mappings = fp.readlines()

# Key each entry on the second field of the line; the value is the first field.
for entry in raw_mappings:
    fields = entry.strip().split("<SEP>")
    stem_mappings[fields[1]] = fields[0]

In [None]:
# stem_mappings

#### GloVe embeddings

In [None]:
# Read the 50-dimensional GloVe file into a list of whitespace-stripped lines.
glove_embeddings = []
glove_file_path = '/content/drive/MyDrive/cmpe256-project/million_songs_data/GloVe_embeddings/glove.6B.50d.txt'
with open(glove_file_path, 'r') as glove_file:
    glove_embeddings = [row.strip() for row in glove_file.readlines()]

In [None]:
# Number of lines (one per vocabulary entry) read from the GloVe file.
len(glove_embeddings)

400000

In [None]:
# Map each token (first space-separated field) to its vector of floats (remaining fields).
embeddings = {}
for entry in glove_embeddings:
    token, *components = entry.split(' ')
    embeddings[token] = [float(component) for component in components]

In [None]:
# embeddings