In [1]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict
import scipy

In [2]:
# Load in user data (one row per user/artist pair, per the columns used below)
user_data_path = '../data/user-df-1-22-2024.csv'
user_df = pd.read_csv(user_data_path)

In [3]:
# Distinct artist names that appear anywhere in the user data
artist_name_column = user_df['artist_name']
artists = artist_name_column.unique()

In [4]:
# Load the per-artist records that get_artist_locations is applied to below.
# NOTE(review): this is the same path a later cell overwrites with the
# artist -> country-code mapping. Once that cell has run, this load returns
# code strings instead of API records -- confirm the intended input file.
with open('../data/artist_location_codes.json', 'r') as artist_info_file:
    artists_info = json.load(artist_info_file)

Before we can clean up the user data, we must decide which artists to keep and use for prediction. Since we keep only artists whose country we know, we first need to collect each artist's location data.

In [5]:
def _first_code(area, key):
    """Return the first entry of ``area[key]``, or None if the key is absent or its list is empty."""
    codes = area[key] if key in area else None
    return codes[0] if codes else None


def get_artist_locations(artist):
    """Extract all location data from an artist data API call.

    Parameters
    ----------
    artist : dict
        One artist record. The optional keys read here are ``'country'`` and
        ``'area'``. ``'area'`` may be a plain string, or a mapping with any of
        ``'iso-3166-1-codes'``, ``'iso-3166-2-codes'``, ``'name'`` and
        ``'sort-name'``.

    Returns
    -------
    dict
        Always contains the keys ``'country'``, ``'area'``,
        ``'iso-3166-1-codes'``, ``'area-name'``, ``'area-sort-name'`` and
        ``'iso-3166-2-codes'``. Anything the record does not provide is None.
        ``'area'`` is only filled when the record's area is a plain string;
        the other area fields are only filled when it is a mapping.
    """
    artist_locations = {
        'country': artist['country'] if 'country' in artist else None,
        'area': None,
        'iso-3166-1-codes': None,
        'area-name': None,
        'area-sort-name': None,
        'iso-3166-2-codes': None,
    }

    area = artist['area'] if 'area' in artist else None
    if area is None:
        return artist_locations

    # A bare string carries no structured fields; keep it verbatim.
    if isinstance(area, str):
        artist_locations['area'] = area
        return artist_locations

    # Only the first listed code is kept. An empty or null code list yields
    # None instead of raising IndexError/TypeError.
    artist_locations['iso-3166-1-codes'] = _first_code(area, 'iso-3166-1-codes')

    subdivision_code = _first_code(area, 'iso-3166-2-codes')
    if subdivision_code is not None:
        # ISO 3166-2 codes start with the two-letter country code (e.g. 'GB-ENG').
        artist_locations['iso-3166-2-codes'] = subdivision_code[:2]

    if 'name' in area:
        artist_locations['area-name'] = area['name']
    if 'sort-name' in area:
        artist_locations['area-sort-name'] = area['sort-name']

    return artist_locations

In [6]:
# Location fields for every listened-to artist that has a record in artists_info
artist_locations = {}
for artist in artists:
    if artist not in artists_info:
        # No record for this artist; it is left out of the table entirely.
        continue
    artist_info = artists_info[artist]
    artist_locations[artist] = get_artist_locations(artist_info)

In [7]:
# Artist location data in all forms available via API.
# The dict is keyed by artist, so the frame is built with artists as columns
# and then flipped to get one row per artist.
locations_by_artist_column = pd.DataFrame(artist_locations)
artist_locations_df = locations_by_artist_column.T
artist_locations_df

Unnamed: 0,country,area,iso-3166-1-codes,area-name,area-sort-name,iso-3166-2-codes
Jasmine Thompson,,,,,,
Eminem,,,,,,
Watsky,,,,,,
Linkin Park,,,,,,
twenty one pilots,,,,,,
...,...,...,...,...,...,...
Ray Conniff and His Orchestra,,,,,,
Dorit Chrysler,,,,,,
Skintone,,,,,,
Royale,,,,,,


In [8]:
# Dict mapping artist location data to a country code
area_to_code = {}
for artist in artist_locations_df.iterrows():
    location_row = artist[1]
    # The ISO 3166-2 pass runs second, so its code replaces the ISO 3166-1 one
    # for any location string a row registers under both.
    for code_column in ('iso-3166-1-codes', 'iso-3166-2-codes'):
        if code_column not in location_row or location_row[code_column] is None:
            continue
        # Every non-null descriptive field of this row becomes a lookup key.
        for field in ['country', 'area-name', 'area-sort-name', 'area']:
            if field not in location_row or location_row[field] is None:
                continue
            area_to_code[location_row[field]] = location_row[code_column]

In [9]:
def artist_to_country(artist, mapping):
    """Resolve an artist's country code from its location fields.

    Parameters
    ----------
    artist : dict or pandas.Series
        Location fields for one artist, keyed by ``'country'``, ``'area'``,
        ``'iso-3166-1-codes'``, ``'area-name'``, ``'area-sort-name'`` and
        ``'iso-3166-2-codes'``. Missing or None entries are treated as unknown.
    mapping : dict
        Lookup from a location string (a value of one of the descriptive
        fields) to a country code.

    Returns
    -------
    The artist's own ISO 3166-1 code if set, otherwise its ISO 3166-2 derived
    code, otherwise the code ``mapping`` holds for the first descriptive field
    value it recognises. None when nothing resolves.
    """
    # Test the stored value, not key presence: the keys are always present,
    # so a membership test would return None and never reach the fallbacks.
    for code_field in ('iso-3166-1-codes', 'iso-3166-2-codes'):
        code = artist.get(code_field)
        if code is not None:
            return code

    # Fall back to codes learned from other artists. The mapping is keyed by
    # the field's value (e.g. 'Detroit'), not by the field name.
    for field in ['country', 'area-name', 'area-sort-name', 'area']:
        location = artist.get(field)
        if location is not None and location in mapping:
            return mapping[location]

    return None

In [10]:
# Mapping of artists to their country codes
artist_location_codes = {}
for artist in artist_locations_df.iterrows():
    artist_code = artist_to_country(artist[1], area_to_code)
    if artist_code is not None:
        artist_location_codes[artist[0]] = artist_code

In [11]:
# Persist the artist -> country-code mapping as JSON.
# NOTE(review): this path is also the file loaded into `artists_info` near the
# top of the notebook, so writing here replaces that input -- confirm intended.
with open('../data/artist_location_codes.json', 'w') as codes_file:
    json.dump(artist_location_codes, codes_file)

Now that we've identified every artist in our data whose country of origin could be resolved, we can clean up `user_df` and create our features.

In [12]:
# Leftover inspection cell: shows the last (artist name, location row) pair
# bound to the loop variable `artist` by the preceding loop.
artist

('DJ Stickle',
 country             None
 area                None
 iso-3166-1-codes    None
 area-name           None
 area-sort-name      None
 iso-3166-2-codes    None
 Name: DJ Stickle, dtype: object)

In [13]:
# Names of the artists that received a country code
artists_to_keep = list(artist_location_codes)

In [14]:
# Keep only the rows whose artist is in artists_to_keep
has_kept_artist = user_df['artist_name'].isin(artists_to_keep)
kept_artists_df = user_df.loc[has_kept_artist]

In [15]:
# Display the filtered user data
kept_artists_df

Unnamed: 0.1,Unnamed: 0,user,artist_name,play_count,artist_url


In [16]:
# Convert to sparse matrix so it can actually be stored on disk
user_ids = kept_artists_df['user'].unique()
artist_names = kept_artists_df['artist_name'].unique()

# Matrix position of each user (row) and each artist (column)
user_to_index = dict(zip(user_ids, range(len(user_ids))))
artist_to_index = dict(zip(artist_names, range(len(artist_names))))

# Reverse lookups: matrix position back to user / artist
index_to_user = dict(zip(user_to_index.values(), user_to_index.keys()))
index_to_artist = dict(zip(artist_to_index.values(), artist_to_index.keys()))

# One (row, column, value) triple per DataFrame row
rows = kept_artists_df['user'].map(user_to_index)
cols = kept_artists_df['artist_name'].map(artist_to_index)
values = kept_artists_df['play_count']

matrix_entries = (values, (rows, cols))
matrix_shape = (len(user_ids), len(artist_names))
sparse_matrix = scipy.sparse.coo_matrix(matrix_entries, shape=matrix_shape)

In [17]:
# Write the user x artist play-count matrix to disk in SciPy's .npz format
scipy.sparse.save_npz(file='../data/user_data.npz', matrix=sparse_matrix)

In [18]:
def get_user_play_ct(user):
    """Return one user's play counts as a dense, artist-labelled Series.

    Parameters
    ----------
    user
        The user ID to query; must be a key of ``user_to_index``.

    Returns
    -------
    pandas.Series
        One entry per column of ``sparse_matrix``, indexed by ``artist_names``.
        Artists with no stored entry for this user appear as 0.

    Raises
    ------
    KeyError
        If ``user`` is not a key of ``user_to_index``.
    """
    user_index = user_to_index[user]

    # Row slice for this user (still sparse)
    user_play_counts = sparse_matrix.getrow(user_index)

    # Flatten to a plain 1-D array
    user_play_counts_dense = user_play_counts.toarray().ravel()

    # Label with artist_names, the array that defined the matrix column order.
    # artists_to_keep only lines up with the columns when its ordering happens
    # to match.
    return pd.Series(user_play_counts_dense, index=artist_names)

In [19]:
# Save the filtered user data as UTF-8 CSV.
# Handing the path to to_csv lets pandas write rows to disk and manage line
# endings itself. Rendering the whole CSV to one string and writing it through
# a text-mode handle turns each line ending into '\r\r\n' on Windows.
# NOTE(review): the index is still written, as before. Reloading this file
# without index_col adds an "Unnamed: 0" column -- confirm whether index=False
# is wanted.
kept_artists_df.to_csv('../data/filtered_user_df.csv', encoding='utf-8')