# Universal ID Generator

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the train and test track tables and tag every row with the split it
# came from, so both splits can be processed together and separated again
# when the results are written out.
train = pd.read_parquet("/scratch/work/courses/DSGA1004-2021/listenbrainz/tracks_train.parquet")
train['train_test'] = 'train'

test = pd.read_parquet("/scratch/work/courses/DSGA1004-2021/listenbrainz/tracks_test.parquet")
test['train_test'] = 'test'

tracks = pd.concat([train, test])

# Lower-case the names so that joins on (artist_name, track_name) ignore case.
# The vectorised `.str` accessor is used instead of `.apply(str.lower)`:
# it yields the same result for string values but leaves missing names as NaN,
# whereas `str.lower` raises TypeError on the first null it is handed.
tracks['track_name'] = tracks['track_name'].str.lower()
tracks['artist_name'] = tracks['artist_name'].str.lower()

In [3]:
# Partition the tracks by whether a recording_mbid is present.
has_mbid = tracks['recording_mbid'].notna()
tracks_filtered = tracks[has_mbid]
tracks_nans = tracks[~has_mbid]

In [None]:
# Lookup table built from the tracks that already carry a recording_mbid:
# the per-row identifiers are dropped and the remaining rows de-duplicated.
known_mbids = tracks_filtered.drop(columns=['recording_msid', 'train_test']).drop_duplicates()

# Left-join the tracks lacking a recording_mbid against that lookup on
# (artist_name, track_name); rows without a name match keep a null recording_mbid.
imputed = pd.merge(
    tracks_nans.drop(columns=['recording_mbid']),
    known_mbids,
    how='left',
    on=['artist_name', 'track_name'],
)

# Split the join result by whether a recording_mbid was found.
mbid_found = imputed['recording_mbid'].notna()
imputed_matched = imputed[mbid_found]
imputed_not_matched = imputed[~mbid_found]

In [None]:
# Give every distinct recording_mbid a dense integer universal_id: 0, 1, 2, ...
tracks_filtered_mbids = tracks_filtered[['recording_mbid']].drop_duplicates()
tracks_filtered_mbids = tracks_filtered_mbids.assign(
    universal_id=np.arange(len(tracks_filtered_mbids))
)

# First id that has not been handed out yet.
start_id = len(tracks_filtered_mbids)

# Attach the universal_id to each track that has a recording_mbid.
tracks_filtered = pd.merge(tracks_filtered, tracks_filtered_mbids, how='left', on='recording_mbid')

# Per (train_test, recording_msid) weight: 1 / (number of recording_mbid rows in
# the group), stored under the name `num_listens`.
tracks_filtered_counts = (
    tracks_filtered
    .groupby(['train_test', 'recording_msid'])['recording_mbid']
    .count()
    .rdiv(1)
    .rename('num_listens')
    .reset_index()
)

tracks_filtered = pd.merge(
    tracks_filtered,
    tracks_filtered_counts,
    how='left',
    on=['train_test', 'recording_msid'],
)

In [None]:
# Per (train_test, recording_msid) weight for the name-matched tracks:
# 1 / (number of recording_mbid rows the join produced for that group).
imputed_matched_counts = (
    imputed_matched
    .groupby(['train_test', 'recording_msid'])['recording_mbid']
    .count()
    .rdiv(1)
    .rename('num_listens')
    .reset_index()
)

# Attach the weight, then look up the universal_id of each matched recording_mbid.
imputed_matched = (
    imputed_matched
    .merge(imputed_matched_counts, how='left', on=['train_test', 'recording_msid'])
    .merge(tracks_filtered_mbids, how='left', on='recording_mbid')
)

In [None]:
# Tracks with no name match get one fresh universal_id per distinct
# (track_name, artist_name) pair, numbered upwards from start_id.
unmatched_names = imputed_not_matched[['track_name', 'artist_name']].drop_duplicates()
imputed_not_matched_ids = unmatched_names.assign(
    universal_id=np.arange(len(unmatched_names)) + start_id
)

# Attach the new ids; each of these rows is given a num_listens of 1.
imputed_not_matched = (
    imputed_not_matched
    .merge(imputed_not_matched_ids, how='left', on=['artist_name', 'track_name'])
    .assign(num_listens=1)
)

In [None]:
# Columns kept in the final mapping.
cols = ['train_test', 'recording_msid', 'universal_id', 'num_listens']

# Stack the three groups of tracks (own mbid, mbid matched by name, no match)
# into a single mapping table.
mapping_parts = (tracks_filtered, imputed_matched, imputed_not_matched)
cleaned_mapping = pd.concat([part[cols] for part in mapping_parts])

In [None]:
split_msid = ['train_test', 'recording_msid']

# The mapping must contain exactly as many distinct (train_test, recording_msid)
# pairs as the input tracks.
mapped_pairs = cleaned_mapping[split_msid].drop_duplicates()
input_pairs = tracks[split_msid].drop_duplicates()
assert len(mapped_pairs) == len(input_pairs)

# The num_listens values of each (train_test, recording_msid) pair must sum to 1
# (up to floating-point error).
listen_totals = cleaned_mapping.groupby(split_msid).agg({'num_listens': 'sum'}).reset_index()
assert (listen_totals['num_listens'] - 1).abs().max() < 1e-6

# For each split, the number of distinct recording_msid values in the mapping
# must equal the number of rows in that split's input frame.
for split_name, split_frame in (('train', train), ('test', test)):
    split_msids = cleaned_mapping.loc[cleaned_mapping['train_test'] == split_name, 'recording_msid']
    assert len(split_msids.unique()) == len(split_frame)

In [None]:
# Write one mapping file per split into the current working directory.  The
# `train_test` tag is dropped because it is constant within each file.
#
# `index=False` keeps the row labels out of the output: `cleaned_mapping` is a
# concatenation of several frames that is then filtered by a boolean mask, so
# its index is not a RangeIndex and pandas would otherwise store it as an
# extra `__index_level_0__` column carrying no information.
for split in ('train', 'test'):
    split_mapping = cleaned_mapping[cleaned_mapping['train_test'] == split]
    split_mapping.drop(columns='train_test').to_parquet(f'tracks_{split}.parquet', index=False)