In [12]:
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer

In [13]:
# Folder holding the Last.FM .dat files read in the next cell (artists.dat, tags.dat,
# user_artists.dat, user_taggedartists.dat, user_friends.dat). The path is relative,
# so it resolves against the notebook's working directory.
base_dir = '../../../Last.FM'

In [14]:
def _read_lastfm(filename, **read_kwargs):
    """Read one tab-separated Last.FM file stored under ``base_dir``."""
    return pd.read_csv(os.path.join(base_dir, filename), sep='\t', **read_kwargs)


df_artists = _read_lastfm('artists.dat')
# tags.dat is the only file read with an explicit ISO-8859-1 encoding
df_tags = _read_lastfm('tags.dat', encoding='ISO-8859-1')
df_user_artists = _read_lastfm('user_artists.dat')  # sorted
df_user_taggedartists = _read_lastfm('user_taggedartists.dat')
df_user_friends = _read_lastfm('user_friends.dat')

In [15]:
# Distinct users / artists in the listening table, then in the tagging table
# (four lines of output, in that order).
for frame in (df_user_artists, df_user_taggedartists):
    for id_column in ('userID', 'artistID'):
        print(len(frame[id_column].unique()))

1892
17632
1892
12523


### USER

In [16]:
# Number of tag assignments per user and per artist, largest count first.
tag_rows_per_user = df_user_taggedartists.groupby('userID').size()
tag_rows_per_artist = df_user_taggedartists.groupby('artistID').size()
most_review = tag_rows_per_user.sort_values(ascending=False)
most_rated = tag_rows_per_artist.sort_values(ascending=False)

# userIDs of the 1000 users with the most tag assignments
top_1000_user = most_review.index[:1000].values

In [17]:
# Standard: keep only the listening records of the 1000 most active taggers.
# Series.isin yields the boolean row mask directly. The previous
# DataFrame(...).isin(...).any(1) detour passed `axis` positionally, which
# pandas >= 2.0 rejects with a TypeError.
final = df_user_artists[df_user_artists['userID'].isin(top_1000_user)].reset_index(drop=True)

# More User
# final = df_user_artists

In [18]:
# Dense re-indexing: original ID -> 0-based position, in order of first appearance in `final`.
mapping_user = {
    user_id: position
    for position, user_id in enumerate(final['userID'].unique().tolist())
}
mapping_artist = {
    artist_id: position
    for position, artist_id in enumerate(final['artistID'].unique().tolist())
}

In [19]:
# Replace the original IDs by their dense indices. An ID missing from its mapping raises KeyError.
# NOTE: this overwrites the columns in place, so the cell must not be run twice on the same frame.
for id_column, lookup in (('userID', mapping_user), ('artistID', mapping_artist)):
    final[id_column] = final[id_column].map(lookup.__getitem__)

In [20]:
# Per-user rank of each artist by listening weight (1 = highest weight).
# method='first' breaks ties by row order, so ranks within a user are unique.
weight_by_user = final.groupby('userID')['weight']
final['rank'] = weight_by_user.rank(ascending=False, method='first').astype('int')

In [21]:
# Wide table: one row per rank, one column per user, cell = artistID.
# Users with fewer artists than the longest list are padded with -1.
artist_by_rank = final.pivot_table(index='rank', columns='userID', values='artistID')
artist_by_rank = artist_by_rank.fillna(-1).astype('int')
# {userID: [artistID at rank 1, artistID at rank 2, ...]}
final_user = artist_by_rank.to_dict('list')

In [22]:
# Drop the -1 padding from each user's list: count the fillers and cut that many
# entries off the tail.
for user_key, ranked_artists in final_user.items():
    padding = sum(1 for artist in ranked_artists if artist == -1)
    if padding > 0:
        final_user[user_key] = final_user[user_key][:-padding]

In [23]:
# Export to npy
# final_user is a plain dict (remapped userID -> list of remapped artistIDs in rank order),
# not an array, so numpy stores it as a pickled object; presumably it has to be read back
# with np.load(..., allow_pickle=True).item() — verify against the consumer.
# NOTE(review): the file name says "more", but the "Standard" (top-1000 users) filter is the
# active one above — confirm this is the intended output name. The 'data' folder is assumed to exist.
np.save('data/lastfm_user_json_more.npy', final_user)

### ITEM

In [24]:
# Tags that occur at least once in the tagging table, as a Series whose index equals its values,
# so it can be joined onto df_tags by tagID.
tag_usage = df_user_taggedartists['tagID'].value_counts()
used_tags = tag_usage.index.to_series().rename('used_tags')

In [25]:
# Keep only the tags that were actually assigned to an artist: the join leaves NaN in
# 'used_tags' for unused tags, dropna() removes those rows, and the helper column is discarded.
new_df_tags = df_tags.join(used_tags, on='tagID').dropna().drop(columns=['used_tags'])

# Lower-case, replace all punctuation (and underscores) with a space, then split on whitespace.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a literal
# string by default, which would leave the punctuation in place.
new_df_tags['tagValue'] = (
    new_df_tags['tagValue']
    .str.lower()
    .str.replace(r'[^\w\s]|_', ' ', regex=True)
    .str.split()
)

In [26]:
# if digit, it should be year
# if alpha, contains vowel but not all, length between 3 and 11
# not digit-alpha

def containsVowel(x):
    """Return True if ``x`` contains at least one lower-case vowel (a, i, u, e, o)."""
    return any(vowel in x for vowel in ('a', 'i', 'u', 'e', 'o'))

def isAllVowel(x):
    """Return True if every element of ``x`` is a lower-case vowel (vacuously True when empty)."""
    vowels = ('a', 'i', 'u', 'e', 'o')
    return all(char in vowels for char in x)

# Explode every tag into its accepted tokens: one (tagID, token) row per kept token.
temp = {'tagID': [], 'tagValue': []}
for tag_id, tokens in zip(new_df_tags['tagID'], new_df_tags['tagValue']):
    for token in tokens:
        # only purely numeric or purely alphabetic tokens are candidates
        if token == '' or not (token.isdigit() or token.isalpha()):
            continue

        # crude singularisation: drop one trailing 's' unless the token ends in 'ss'
        if len(token) > 1 and token[-1] == 's' and token[-2] != 's':
            token = token[:-1]

        # four-digit token starting with 19 or 20
        looks_like_year = token.isdigit() and token[:2] in ('19', '20') and len(token) == 4
        # alphabetic token of 4 to 10 characters with at least one vowel and one non-vowel
        looks_like_word = (
            token.isalpha()
            and 3 < len(token) < 11
            and containsVowel(token)
            and not isAllVowel(token)
        )
        if looks_like_year or looks_like_word:
            temp['tagValue'].append(token)
            temp['tagID'].append(tag_id)

final_df_tags = pd.DataFrame(temp)

len(final_df_tags['tagValue'].unique())

6036

In [27]:
# Attach the cleaned tokens to every tag assignment. Assignments whose tag produced
# no token come out of the join with NaN and are removed by dropna().
tokens_by_tag = final_df_tags.set_index('tagID')
tag_assignments = df_user_taggedartists.drop(columns=['userID', 'day', 'month', 'year'])
artist_tags = tag_assignments.join(tokens_by_tag, on='tagID').dropna()

In [28]:
# One comma-separated "document" of tag tokens per artist.
artist_corpus = artist_tags.groupby('artistID')['tagValue'].apply(','.join).reset_index()
# Keep only artists present in mapping_artist (i.e. artists that appear in `final`).
# Series.isin replaces the DataFrame(...).isin(...).any(1) detour, whose positional
# `axis` argument is rejected by pandas >= 2.0.
artist_corpus = artist_corpus[artist_corpus['artistID'].isin(list(mapping_artist.keys()))]
len(artist_corpus['artistID'].unique())

9370

In [29]:
# add empty id
# Every artist in mapping_artist needs a row in the corpus, so artists without any
# usable tag get an empty document.

idx_exist = list(artist_corpus['artistID'].unique())
idx_check = list(mapping_artist.keys())

# Set membership keeps this linear. The missing rows are added with a single concat
# because DataFrame.append was removed in pandas 2.0 (and was quadratic inside a loop).
known_ids = set(idx_exist)
missing_ids = [artist_id for artist_id in idx_check if artist_id not in known_ids]
if missing_ids:  # skip the concat when nothing is missing so dtypes stay untouched
    artist_corpus = pd.concat(
        [artist_corpus, pd.DataFrame({'artistID': missing_ids, 'tagValue': ''})],
        ignore_index=True,
    )

len(artist_corpus['artistID'].unique())

12209

In [30]:
# Switch to the dense artist index and order the rows by it, so that row i of the
# corpus belongs to remapped artist i.
# NOTE: overwrites artistID in place; the cell is not safe to run twice.
dense_artist_ids = artist_corpus['artistID'].map(mapping_artist.__getitem__)
artist_corpus['artistID'] = dense_artist_ids
artist_corpus = artist_corpus.sort_values(by='artistID').reset_index(drop=True)

In [31]:
# Preview of the corpus: one row per remapped artistID with its comma-joined tag tokens
# (an empty string for artists that have no usable tag).
artist_corpus

Unnamed: 0,artistID,tagValue
0,0,"wave,wave,wave,electronic,british,beautiful,lo..."
1,1,"chillout,downtempo,electronic,trip,female,vova..."
2,2,"electronic,autumn,downtempo,electronic,trip,da..."
3,3,"relax,electronic,female,vocalist,trip,chillout..."
4,4,"seen,live,electronic,dance,dance,female,vocali..."
...,...,...
12204,12204,
12205,12205,downtempo
12206,12206,"electronic,downtempo,vfsix"
12207,12207,"drum,bass,liquid,funk"


In [32]:
def searchByValue(dictionary, x):
    """Reverse lookup: return the first key whose value equals ``x``.

    Keys are scanned in the dictionary's iteration order. Returns None when no
    value matches.
    """
    matching_keys = (key for key, value in dictionary.items() if value == x)
    return next(matching_keys, None)

In [33]:
# Create TF-IDF Context Vector for item features
# Each artist's comma-joined tag tokens form one document; rows follow artist_corpus order.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(artist_corpus['tagValue'])

# get_feature_names() was removed in scikit-learn 1.2. Prefer the replacement and
# fall back only on old installations that do not have it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

df_items = pd.DataFrame(X.toarray(), columns=feature_names)

# Set dimension to 25
# random_state pins the result when PCA selects a randomized SVD solver, so the
# exported item features are identical on every Restart & Run All.
pca = PCA(n_components=25, random_state=0)
final_item = pca.fit_transform(df_items)
pd.DataFrame(final_item)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.020682,-0.081229,-0.002902,-0.021293,-0.000209,0.096735,-0.084007,-0.077466,0.111649,-0.121347,...,0.105891,0.177286,-0.186743,-0.037070,0.080153,0.126405,-0.060539,-0.001755,-0.001926,-0.047913
1,-0.134205,-0.143446,0.097701,0.131950,-0.066020,0.265616,0.045124,0.273804,-0.177866,-0.059126,...,0.038957,0.443554,0.160021,-0.290982,0.073566,0.002657,-0.127503,0.219943,-0.092763,0.007949
2,-0.122967,-0.182464,0.082962,-0.076063,-0.225437,0.615120,0.066081,0.393686,-0.094136,-0.095718,...,-0.008933,0.046235,0.056409,-0.062520,-0.019454,-0.015348,-0.069595,0.057661,0.041278,0.023548
3,-0.092207,-0.158967,0.150027,0.122702,-0.048973,0.294117,0.069315,0.260993,-0.161563,-0.018476,...,-0.006134,0.393886,0.145437,-0.224400,0.030726,-0.044983,-0.106564,0.191609,-0.078472,-0.007380
4,-0.093490,-0.223318,0.302608,0.374711,0.047500,0.371016,0.009726,-0.257327,-0.014887,-0.228411,...,0.020294,-0.118911,0.000442,-0.226355,0.126207,-0.013219,0.002936,-0.032635,-0.077303,0.006982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12204,-0.103218,-0.026448,-0.055338,-0.027015,-0.025997,-0.059183,-0.016063,-0.021892,-0.021382,0.011522,...,-0.011028,0.006853,0.015438,0.006751,-0.001771,-0.017177,0.001739,-0.010851,0.001092,-0.008119
12205,-0.123178,-0.053858,-0.049544,-0.031598,-0.075154,0.042186,0.007270,0.180510,-0.140786,-0.022727,...,0.014163,0.301541,0.070584,-0.211017,0.022104,-0.045432,-0.084023,0.079590,-0.074187,0.004788
12206,-0.134950,-0.088304,-0.012816,-0.037036,-0.113622,0.257490,0.004537,0.092940,-0.033786,-0.075811,...,-0.037169,0.131069,0.096226,-0.054338,-0.029862,-0.036518,-0.032690,0.073574,-0.006667,0.023897
12207,-0.107856,-0.032076,-0.058335,-0.023116,-0.034906,-0.052212,-0.018922,-0.021644,-0.028552,-0.001585,...,-0.008600,-0.024276,0.020479,-0.001542,0.003723,-0.044911,-0.028134,0.007264,0.084322,0.038435


In [34]:
# Export to npy
# final_item is the PCA output of the previous cell: one 25-component row per artist,
# in the row order of artist_corpus (sorted by remapped artistID).
# The 'data' folder is assumed to exist already.
np.save('data/lastfm_item_feature.npy', final_item)

In [35]:
# Original artistIDs ordered by their number of rows in artist_tags, most tagged first.
top_artist = artist_tags.groupby('artistID').size().sort_values(ascending=False).index.values

# Keep the first 1000 of them that are present in mapping_artist.
# Dict membership is O(1); the previous code rebuilt list(mapping_artist.keys())
# on every iteration and kept scanning after the quota was reached.
new_top_artist = []
for i in top_artist:
    if len(new_top_artist) >= 1000:
        break
    if i in mapping_artist:
        new_top_artist.append(i)

# Translate to the dense artist index used everywhere else.
final_popular_artist = pd.Series(new_top_artist).apply(lambda x: mapping_artist[x])

len(final_popular_artist)

1000

In [38]:
# Export to npy
# final_popular_artist is a pandas Series of at most 1000 remapped artist indices,
# ordered from most to least tagged. The 'data' folder is assumed to exist already.
np.save('data/lastfm_popular_item.npy', final_popular_artist)