## Setup

In [None]:
# Mount Google Drive inside the Colab filesystem; the data paths used by the
# cells below all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import os
import zipfile
import requests
from io import StringIO

import pandas as pd
import numpy as np

In [None]:
# Unpack the zipped genre annotation archive into the `genres` data folder.
archive_path = "/content/drive/MyDrive/cmpe256-project/data/msd_tagtraum_cd1.cls.zip"
extract_dir = "/content/drive/MyDrive/cmpe256-project/data/genres"

with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

## Reading in File

In [None]:
# Load the extracted annotation file as a list of raw text lines.
genre_file_path = '/content/drive/MyDrive/cmpe256-project/data/genres/msd_tagtraum_cd1.cls'

with open(genre_file_path, 'r') as genre_file:
    # The first 7 lines are comments, so only the lines after them are kept.
    raw_genres = genre_file.readlines()[7:]

# Number of data lines that remain after dropping the commented lines.
len(raw_genres)

133676

## Parsing

In [None]:
def _split_fields(line):
    """Strip surrounding whitespace from a raw line and split it on tabs."""
    return line.strip().split('\t')


# One list of tab-separated fields per raw line.
genres = list(map(_split_fields, raw_genres))

In [None]:
# Map each song ID (first field of a row) to the list of remaining fields,
# i.e. the genre labels attached to that song.
#
# A comprehension is used so that no loop variable leaks into the notebook
# namespace. In particular `genres` is left untouched: rebinding it here to a
# single row's labels would make this cell produce a wrong mapping when re-run.
genres_dict = {row[0]: row[1:] for row in genres}

In [None]:
# Preview only the first few entries; displaying the full mapping would dump
# every parsed song into the cell output.
dict(list(genres_dict.items())[:10])

{'TRAAAAK128F9318786': ['Pop_Rock'],
 'TRAAAAW128F429D538': ['Rap'],
 'TRAAABD128F429CF47': ['Pop_Rock'],
 'TRAAAED128E0783FAB': ['Jazz', 'Vocal'],
 'TRAAAEF128F4273421': ['Pop_Rock'],
 'TRAAAEM128F93347B9': ['Electronic'],
 'TRAAAFD128F92F423A': ['Pop_Rock'],
 'TRAAAGR128F425B14B': ['Pop_Rock'],
 'TRAAAHZ128E0799171': ['Rap'],
 'TRAAAIR128F1480971': ['Pop_Rock', 'RnB'],
 'TRAAANK128F428B515': ['Pop_Rock', 'Electronic'],
 'TRAAARJ128F9320760': ['Pop_Rock'],
 'TRAABFH128F92C812E': ['Pop_Rock'],
 'TRAABHO12903D08576': ['Jazz'],
 'TRAABIG128F9356C56': ['Pop_Rock'],
 'TRAABJS128F9325C99': ['Pop_Rock'],
 'TRAABLR128F423B7E3': ['Pop_Rock'],
 'TRAABNV128F425CEE1': ['New Age', 'Jazz'],
 'TRAABOG128F42955B1': ['Pop_Rock'],
 'TRAABWH128F427ABE8': ['Jazz', 'Pop_Rock'],
 'TRAABWX128F1464374': ['Electronic', 'Pop_Rock'],
 'TRAACER128F4290F96': ['Pop_Rock', 'Jazz'],
 'TRAACEU128F92C3B82': ['Jazz'],
 'TRAACJD128E078926C': ['Pop_Rock'],
 'TRAACLG128F4276511': ['Electronic'],
 'TRAACMJ128F930C704': ['P

## One-Hot Encoding Genres

In [None]:
# Collect every distinct genre label that appears in `genres_dict`.
# The labels are sorted so that the resulting column order is identical on
# every run; plain set iteration order for strings can change between kernel
# restarts because of hash randomisation.
genres_list = sorted(
    {genre for song_genres in genres_dict.values() for genre in song_genres}
)

print(len(genres_list))
genres_list

13


['Vocal',
 'Rap',
 'Jazz',
 'New Age',
 'Reggae',
 'Latin',
 'Pop_Rock',
 'Electronic',
 'International',
 'Country',
 'Blues',
 'Folk',
 'RnB']

In [None]:
import csv

# Column layout: the track identifier followed by one indicator column per genre.
columns = ["track_id"]
columns.extend(genres_list)

# The context manager guarantees the file is flushed and closed even if writing
# a row raises. newline="" lets the csv module control line endings itself.
with open("/content/drive/MyDrive/cmpe256-project/data/genres/song_genre_one_hot_encodings.csv", "w", newline="") as fp:
    writer = csv.DictWriter(fp, fieldnames=columns)
    writer.writeheader()

    for song_id, song_genres in genres_dict.items():
        # 1 when the song carries the genre, 0 otherwise.
        one_hot_row = {genre: int(genre in song_genres) for genre in genres_list}
        one_hot_row["track_id"] = song_id
        writer.writerow(one_hot_row)

## Encoding Genres

In [None]:
import csv

# NOTE(review): apart from the output filename, this cell writes exactly the
# same one-hot table as the "One-Hot Encoding Genres" cell. Confirm whether a
# different encoding was intended for this section.

# Column layout: the track identifier followed by one indicator column per genre.
columns = ["track_id"]
columns.extend(genres_list)

# The context manager guarantees the file is flushed and closed even if writing
# a row raises. newline="" lets the csv module control line endings itself.
with open("/content/drive/MyDrive/cmpe256-project/data/genres/song_genre_encodings.csv", "w", newline="") as fp:
    writer = csv.DictWriter(fp, fieldnames=columns)
    writer.writeheader()

    for song_id, song_genres in genres_dict.items():
        # 1 when the song carries the genre, 0 otherwise.
        encoded_row = {genre: int(genre in song_genres) for genre in genres_list}
        encoded_row["track_id"] = song_id
        writer.writerow(encoded_row)