In [2]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                   
import logging

sagemaker_config_logger = logging.getLogger("sagemaker.config")
sagemaker_config_logger.setLevel(logging.WARNING)

# Define IAM role
role = get_execution_role()
print("execution role ARN: ", role)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
execution role ARN:  arn:aws:iam::629515838455:role/service-role/AmazonSageMaker-ExecutionRole-20231125T142747


##

In [3]:

# use the default sagemaker s3 bucket to store processed data
# here we figure out what that default bucket name is 
sagemaker_session = sagemaker.Session()
bucket_name = sagemaker_session.default_bucket()
print(bucket_name)  
# bucket name format: "sagemaker-{region}-{aws_account_id}"
%store bucket_name

sagemaker-ap-northeast-2-629515838455
Stored 'bucket_name' (str)


In [4]:
import pandas as pd 
merged_data_path = 's3://{}/{}'.format(bucket_name, "data/merged_dataset.csv")
user_artist_table_path = 's3://{}/{}'.format(bucket_name, "data/user-artist-table.csv")
vector_out_path = 's3://{}/{}'.format(bucket_name, "data/user-artist-playcount-table.csv")

# music_listening_info = pd.read_csv(merged_data_path)
 
# feature_cols = ['user_id', 'playcount', 'artist']
# music_listening_info = music_listening_info[feature_cols]

# music_listening_info.sample(n=50000)
# music_listening_info.to_csv(user_artist_table_path)

df = pd.read_csv(user_artist_table_path)

In [5]:
from tqdm.auto import tqdm

df = df.sample(n=5000)
user_set = df['user_id'].unique()
artist_set = df['artist'].unique()

user_playcount_list = []
for user in tqdm(user_set, desc = "Get playcount values for each user: "):
  user_df = df[df['user_id'] == user].reset_index(drop=False)
  # print(user_df)

  user_playcount = [0.0] * len(artist_set)
  
  for i, artist in enumerate(user_df['artist']):
    artist_index = list(artist_set).index(artist)
    # print(artist_index)
    # print(user_df.loc[i, ['playcount']].values[0])
    user_playcount[artist_index] = user_df.loc[i, ['playcount']].values[0]

  # print(user_playcount)
  user_playcount_list.append(user_playcount)


    
user_item_vectors_df = pd.DataFrame({'user_id' : user_set, 
                                  'vector' : user_playcount_list
                                  })


user_item_vectors_df
user_item_vectors_df.to_csv(vector_out_path)

%store user_set
%store artist_set
%store user_playcount_list

Get playcount values for each user:   0%|          | 0/4959 [00:00<?, ?it/s]

Stored 'user_set' (ndarray)
Stored 'artist_set' (ndarray)
Stored 'user_playcount_list' (list)


In [None]:
%store -r user_set
%store -r artist_set
%store -r user_playcount_list

print(user_set)
print(artist_set)
print(user_playcount_list)

In [None]:
user_item_matrix = df.pivot_table(index='user_id', columns='artist', values='playcount', fill_value=0)
user_item_matrix

In [10]:
# 리스트를 파싱하여 NumPy 배열로 변환
playcount_vectors = np.empty((0, len(user_playcount_list[0])), dtype=int)
for vector in tqdm(user_playcount_list):
    playcount_vectors = np.vstack((playcount_vectors, vector))

  0%|          | 0/4959 [00:00<?, ?it/s]

In [11]:
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# 희소 행렬 생성
sparse_user_item = csr_matrix(playcount_vectors)
# 유사도 계산
user_similarity = cosine_similarity(sparse_user_item)


In [12]:

# 추천 함수
def recommend_artists_for_user(user_id, n_recommendations=5):
    if user_id not in list(user_set):
        return "User not found"

    user_index = list(user_set).index(user_id)
    similar_users = user_similarity[user_index].argsort()[-50:][::-1]

    # 현재 사용자가 평가한 아이템
    user_items_rated = np.nonzero(playcount_vectors[user_index])[0]

    # 유사한 사용자들이 좋아하는 아이템 추천
    recommended_items = {}
    for similar_user_index in similar_users:
        similar_user_id = user_item_vectors_df['user_id'][similar_user_index]
        similar_user_items = playcount_vectors[similar_user_index, :]

        not_rated_items = list(set(np.nonzero(similar_user_items)[0]) - set(user_items_rated))

        if not_rated_items is not None:
          for item in not_rated_items:
              recommended_items[item] = recommended_items.get(item, 0) + similar_user_items[item]

    # 가장 높은 평점을 받은 아이템 추천
    recommended_items_sorted = sorted(recommended_items.items(), key=lambda x: x[1], reverse=True)
    
    # 각 튜플의 첫 번째 요소 추출
    recommended_items = [tup[0] for tup in recommended_items_sorted] 

    final_recommended_items = []

    for recommended_item in recommended_items:
      final_recommended_items.append(artist_set[recommended_item])

    return final_recommended_items[:n_recommendations]


In [13]:

# 특정 사용자에 대한 추천
user_id = list(user_set)[5]

recommendations = recommend_artists_for_user(user_id, 5)
print(recommendations)

['Dr. Dre', 'Hot Chip', 'The New Pornographers', 'Tim Armstrong', 'Soldiers of Jah Army']


In [17]:
%pip install spotipy

Collecting spotipy
  Downloading spotipy-2.23.0-py3-none-any.whl (29 kB)
Collecting redis>=3.5.3 (from spotipy)
  Obtaining dependency information for redis>=3.5.3 from https://files.pythonhosted.org/packages/0b/34/a01250ac1fc9bf9161e07956d2d580413106ce02d5591470130a25c599e3/redis-5.0.1-py3-none-any.whl.metadata
  Downloading redis-5.0.1-py3-none-any.whl.metadata (8.9 kB)
Collecting async-timeout>=4.0.2 (from redis>=3.5.3->spotipy)
  Obtaining dependency information for async-timeout>=4.0.2 from https://files.pythonhosted.org/packages/a7/fa/e01228c2938de91d47b307831c62ab9e4001e747789d0b05baf779a6488c/async_timeout-4.0.3-py3-none-any.whl.metadata
  Using cached async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Downloading redis-5.0.1-py3-none-any.whl (250 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.3/250.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hUsing cached async_timeout-4.0.3-py3-none-any.whl (5.7 kB)
[33mDEPRECATION: pyodbc 4.0.0-

In [22]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pprint
 
cid = '229a64fea61d4732bda993b5a16578be'
secret = '#'
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
 
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [30]:
for recommend in recommendations:
    print(f"\n[{recommend}]\n")
    search_result = sp.search(recommend, limit=1, type="artist")
#     pprint.pprint(search_result)
    
    artist_uri = search_result["artists"]["items"][0]["id"]
    
    album_result = sp.artist_albums(artist_uri, limit=2)
    items = album_result["items"]
    pprint.pprint(items[0]['name'])
    pprint.pprint(items[1]['name'])

Couldn't read cache at: .cache
Couldn't write token to cache at: .cache
Couldn't read cache at: .cache



[Dr. Dre]



Couldn't write token to cache at: .cache
Couldn't read cache at: .cache
Couldn't write token to cache at: .cache
Couldn't read cache at: .cache


'Compton'
'Dr. Dre & Friends'

[Hot Chip]



Couldn't write token to cache at: .cache
Couldn't read cache at: .cache
Couldn't write token to cache at: .cache
Couldn't read cache at: .cache


'Freakout/Release'
'Late Night Tales: Hot Chip (LNT Mix)'

[The New Pornographers]



Couldn't write token to cache at: .cache
Couldn't read cache at: .cache
Couldn't write token to cache at: .cache
Couldn't read cache at: .cache


'Continue as a Guest'
'In The Morse Code Of Brake Lights'

[Tim Armstrong]



Couldn't write token to cache at: .cache
Couldn't read cache at: .cache
Couldn't write token to cache at: .cache
Couldn't read cache at: .cache


'A Poets Life'
'The Essential Cypress Hill'

[Soldiers of Jah Army]



Couldn't write token to cache at: .cache


'Beauty In The Silence'
'Poetry In Motion'
