# Clean Dataset and create subset for KG creation

In [62]:
ratio = .75
name = '75percent_subset'

In [63]:
import pandas as pd
import numpy as np
import csv
import time
import sys
import json
import joblib
import os

from tqdm import tqdm

In [64]:
users = pd.read_csv('data/lfm/users.tsv', sep='\t')
tracks = pd.read_csv('data/lfm/tracks.tsv', sep='\t')
listening_events = pd.read_csv('data/lfm/listening_events.tsv', sep='\t')
albums = pd.read_csv('data/lfm/albums.tsv', sep='\t')

In [65]:
len(tracks)

4079421

In [66]:
len(listening_events)

30357786

## Clean Dataset

<b> Remove users which gender is not defined and whose age is not element of [10;90] </b>

In [67]:
l = len(users)
users = users[users.gender.isin(['m', 'f']) & (users.age >= 10) & (users.age <= 95)]
print('Removed %s users' % (len(users) - l))

Removed -5163 users


<b> Remove now unassigned listening events </b>

In [68]:
l = len(listening_events)
listening_events = listening_events[listening_events.user_id.isin(users.user_id)]
print('Removed %s listening events' % (len(listening_events) - l))

Removed -9725559 listening events


<b> remove duplicate listening events </b>

In [69]:
listening_events = listening_events.sort_values('timestamp')
old_length_le = len(listening_events)
listening_events = listening_events.drop_duplicates(subset=['user_id', 'track_id'])
print(len(listening_events)-old_length_le)

-10636671


<b> 10-core filtering </b>

In [70]:
n_core = 5
while True:
    start_number = len(listening_events)

    # Item pass
    item_counts = listening_events.track_id.value_counts()
    item_above = set(item_counts[item_counts >= n_core].index)
    listening_events = listening_events[listening_events.track_id.isin(item_above)]
    print('Records after item pass: ', len(listening_events))

    # User pass
    user_counts = listening_events.user_id.value_counts()
    user_above = set(user_counts[user_counts >= n_core].index)
    listening_events = listening_events[listening_events.user_id.isin(user_above)]
    print('Records after user pass: ', len(listening_events))

    if len(listening_events) == start_number:
        print('Exiting...')
        break

Records after item pass:  6213108
Records after user pass:  6212621
Records after item pass:  6212421
Records after user pass:  6212421
Records after item pass:  6212421
Records after user pass:  6212421
Exiting...


In [71]:
users = users[users.user_id.isin(set(listening_events.user_id))]
tracks = tracks[tracks.track_id.isin(set(listening_events.track_id))]

In [72]:
users.sample(1)

Unnamed: 0,user_id,country,age,gender,creation_time
7814,28457,BR,21,m,2008-12-30 05:46:26


## Create a subset given a ratio

In [73]:
n = ratio*len(tracks)
n = int(np.ceil(n))
n

278646

In [74]:
tracks_subset = tracks.sample(n)
tracks_subset.to_csv('data/lfm/%s_tracks.tsv' % name, sep='\t', index=False)
tracks_subset.head()

Unnamed: 0,track_id,artist,track
2915403,36009223,Far From Alaska,Slug
1666621,21227870,Black Light Burns,It's Good To Be Gold
2498943,31113571,Eydie Gormé,Piel Canela
3071783,37868019,Oakwood Station,Summer Breeze Memories
2011736,25351420,LANY,Made in Hollywood


In [75]:
listening_events_subset = listening_events[listening_events['track_id'].isin(tracks_subset['track_id'])]
listening_events_subset.to_csv('data/lfm/%s_listening_events.tsv' % name, sep='\t', index=False)
listening_events_subset

Unnamed: 0,user_id,track_id,album_id,timestamp
2,16026,40012596,18558010,2020-01-01 00:00:01
14,14807,21889387,16391976,2020-01-01 00:00:03
13,23625,16551011,9087976,2020-01-01 00:00:03
12,24008,39377924,19382688,2020-01-01 00:00:03
16,53114,17002235,19089164,2020-01-01 00:00:04
...,...,...,...,...
30357730,27180,30303658,7821795,2020-03-20 12:59:50
30357760,865,17367418,22700688,2020-03-20 12:59:55
30357779,59921,42135253,21202843,2020-03-20 12:59:58
30357776,48354,24730041,13118529,2020-03-20 12:59:58


In [76]:
users_subset = users[users['user_id'].isin(listening_events_subset['user_id'].unique())]
users_subset.to_csv('data/lfm/%s_users.tsv' % name, sep='\t', index=False)
users_subset

Unnamed: 0,user_id,country,age,gender,creation_time
0,2,UK,35,m,2002-10-29 01:00:00
2,14,UK,48,m,2003-02-18 21:44:13
3,15,US,28,m,2003-02-24 03:30:33
7,36,UK,29,m,2003-03-27 12:17:12
8,42,RU,30,m,2003-04-10 01:07:48
...,...,...,...,...,...
15242,119397,SK,31,m,2012-05-25 17:41:55
15243,119516,RU,19,m,2012-05-26 10:23:12
15252,119957,PL,16,f,2012-05-28 19:59:37
15254,120095,RU,20,f,2012-05-29 21:19:14


In [77]:
albums_subset = albums[albums['album_id'].isin(listening_events_subset['album_id'].unique())]
albums_subset.to_csv('data/lfm/%s_albums.tsv' % name, sep='\t', index=False)
albums_subset

Unnamed: 0,album_id,artist,album
2,167,!!!,
197,4152,Toss A Coin To Your Witcher Female Cover by Ra...,
202,4323,Weird Al Yankovic,
403,7963,$ubjectz,
405,7982,$uicideboy$,
...,...,...,...
1676876,24227802,방탄소년단,화양연화 pt.2
1676912,24229701,DIR EN GREY,ＡＲＣＨＥ Disc 1
1676924,24230497,the GazettE,ＤＯＧＭＡ
1676954,24232012,DIR EN GREY,ＭＡＣＡＢＲＥ


# Create KG files

In [78]:
#csv.field_size_limit(sys.maxsize)

## Entities

- user
- artist
- album
- track
- genre

**Additional information:**
- gender
- country

#### Users

In [79]:
path = 'data/lfm'
path_prefix = 'data/lfm/intermediate_kg'
read_path_prefix = 'data/lfm/%s' % name
write_path_prefix = 'data/lfm/intermediate_kg/%s' % name

In [80]:
# user_id, country, age, gender, creation_time
with open("%s_users.tsv" % read_path_prefix, "r", encoding='utf-8') as data, open("%s_users.txt" % write_path_prefix, "w") as f:
    datareader = csv.reader(data, quoting=csv.QUOTE_NONE)
    
    for i, row in tqdm(enumerate(datareader)):
        if i == 0:
            header = row[0].split(sep="\t")
        if i != 0:
            entry = row[0].split(sep="\t")
            f.writelines("u" + entry[0] + "\t" + entry[1] + "\t" + entry[2] + "\t" + entry[3] + "\n")
            
time.sleep(0.5)            

print(header)
print("done.")

9725it [00:00, 528349.09it/s]


['user_id', 'country', 'age', 'gender', 'creation_time']
done.


#### Artists

In [81]:
# artist_id, artist_name
#with open("data/artists.tsv", "r", encoding='utf-8') as data, open("data/kg/artists.txt", "w") as f:
#    datareader = csv.reader(data, quoting=csv.QUOTE_NONE)
#    
#    for i, row in tqdm(enumerate(datareader)):
#        if i == 0:
#            header = row[0].split(sep="\t")
#        if i != 0:
#            entry = row[0].split(sep="\t")
#            f.writelines("a" + entry[0] + "\t" + str(entry[1].encode("utf-8")) + "\n")
#            
#time.sleep(0.5)            
#
#print(header)
#print("done.")

#### Albums

In [82]:
# album_id, album_name, artist_name
#with open("data/albums.tsv", "r", encoding='utf-8') as data, open("data/kg/albums.txt", "w") as f:
#    datareader = csv.reader(data, quoting=csv.QUOTE_NONE)
#    
#    for i, row in tqdm(enumerate(datareader)):
#        if i == 0:
#            header = row[0].split(sep="\t")
#        if i != 0:
#            entry = row[0].split(sep="\t")
#            f.writelines("b" + entry[0] + "\t" + str(entry[1].encode("utf-8")) +  "\n")
#            
#time.sleep(0.5)
#
#print(header)
#print("done.")

#### Tracks

In [83]:
# track_id, artist_name, track_name
track_complete_dict = {}
counter = 0
with open("%s_tracks.tsv" % read_path_prefix, "r", encoding='utf-8') as data, open("%s_tracks.txt" % write_path_prefix, "w") as f:
    datareader = csv.reader(data, quoting=csv.QUOTE_NONE)
    
    for i, row in tqdm(enumerate(datareader)):
        if i == 0:
            header = row[0].split(sep="\t")
        if i != 0:
            entry = row[0].split(sep="\t")
            try:
                f.writelines("t" + entry[0] + "\t" + str(entry[2].encode("utf-8")) + "\n")
                track_id = "t" + entry[0]
                if track_id not in track_complete_dict:
                    track_complete_dict[track_id] = 0
            except:
                f.writelines("t" + entry[0] + "\t" + "???" + "\n")
                counter += 1
            
time.sleep(0.5)            

print(header)
print("done.")
print(counter)

278647it [00:00, 311645.27it/s]


['track_id', 'artist', 'track']
done.
783


In [84]:
print(len(list(track_complete_dict.keys())))

277863


#### Genres

In [85]:
#counter = 0
#with open("data/micro_genre_tags.json", "r", encoding='utf-8') as data, open("data/kg/genres.txt", "w") as f:
#    for line in tqdm(data):
#        try:
#            entry = json.loads(line)
#            track_name = entry['_id']['track']
#            artist_name = entry['_id']['artist']
#            main_genre = list(entry['tags'].keys())[0]
#            genres = ",".join(list(entry['tags'].keys()))
#
#            f.writelines(track_name + "\t" + artist_name + "\t" + main_genre + "\t" + genres + "\n")
#        except:
#            counter += 1
#            continue
#            
#time.sleep(0.5)            
#
#print(header)
#print("done.")
#print(counter)

#### Gender & Country

In [86]:
gdict = {}
cdict = {}

gcounter = 0
ccounter = 0

with open("%s_users.txt" % write_path_prefix, "r") as data,\
open("%s/entities/%s_gender.txt"  % (path_prefix, name), "w") as f,\
open("%s/entities/%s_country.txt" % (path_prefix, name), "w") as f2:
    datareader = csv.reader(data)
    
    for i, row in tqdm(enumerate(datareader)):
        entry = row[0].split(sep="\t")
        if entry[3] == "" or entry[3] in gdict:
            gcounter += 1
        else:
            gdict[entry[3]] = 0
            f.writelines(entry[3] + "\n")
            
        if entry[1] == "" or entry[1] in cdict:
            ccounter += 1
        else:
            cdict[entry[1]] = 0
            f2.writelines(entry[1] + "\n")
            
time.sleep(0.5)
            
print("done.")
print(gcounter)
print(ccounter)

9724it [00:00, 1137256.00it/s]


done.
9722
9600


#### Country

In [87]:
# DONE

## Relations

- **listened_to:** _user_ listened_to _track_
- **in_album:** _track_ in_album _album_
- **created_by:** _track_ created_by _artist_
- **has_genre**: _track_ has_genre _genre_

**Additional information:**
- **has_micro_genre**: _track_ has_micro_genre _genre_ (**or** instead of has_genre)
- **has_gender**: _user_ has_gender _gender_
- **lives_in**: _user_ lives_in _country_

In [88]:
# check genres.txt --> clean track and artist names, we filter it with that list?
# create new dict mit track names, artist names and look up for all the relations

# track_name, artist_name, main_genre, micro_genre

#track_dict = {}

#with open("data/kg/genres.txt", "r") as data: #, open("data/kg/tracks.txt", "w") as f:
#    datareader = csv.reader(data)
    
#    for i, row in tqdm(enumerate(datareader)):
#        try:
#            helper = row[0].split("\t")
#            track_name = helper[0]
#            artist_name = helper[1]
#            main_genre = helper[2]
#            micro_genre = row[1:]
#        except:
#            row = ", ".join(row)
#            helper = row.split("\t")
            
#            track_name = helper[0]
#            artist_name = helper[1]
#            main_genre = helper[2]
#            micro_genre = helper[3].split(",")[1:]
#            continue
        
#        key = track_name # + " || " + artist_name
        
#        if key not in track_dict:
#            track_dict[key] = {}
#            track_dict[key]["artist"] = artist_name
#            track_dict[key]["main_genre"] = main_genre
#            track_dict[key]["micro_genre"] = micro_genre
        
#print("saving...")

#joblib.dump(track_dict, "data/kg/track_dict.pkl")

#print("done.")

#### listened_to

In [89]:
# user_id, track_id, album_id, timestamp
counter = 0
counts_dict = {}
track_dict = {}
with open("%s_listening_events.tsv" % (read_path_prefix), "r") as data, open("%s/relations/%s_listened_to.txt" % (path_prefix, name), "w") as f:
    datareader = csv.reader(data)
    
    for i, row in tqdm(enumerate(datareader)):
        if i == 0:
            header = row[0].split(sep="\t")
        if i != 0:
            entry = row[0].split(sep="\t")
            
            track_id = "t" + entry[1]
            if track_id not in track_dict:
                track_dict[track_id] = 0
            
            key = "u" + entry[0] + ", " + "listened_to" + ", " + "t" + entry[1]
            if key not in counts_dict:
                counts_dict[key] = 0
                f.writelines("u" + entry[0] + "\t" + "listened_to" + "\t" + "t" + entry[1] + "\n")
            else:
                counts_dict[key] += 1
                
            #try:
            #    f.writelines("u" + entry[0] + "\t" + "listened_to" + "\t" + "t" + entry[1] + "\n")
            #except:
            #    counter += 1
            #    continue
            
time.sleep(0.5)            

print(header)
print("done.")
print(counter)

4636769it [00:15, 305833.70it/s]


['user_id', 'track_id', 'album_id', 'timestamp']
done.
0


In [90]:
print(len(list(track_dict.keys())))

278646


#### created_by & has_genre & has_micro_genre & in_album

In [91]:
counter = 0

artist_dict = {}
album_dict = {}
type_dict = {}
genre_dict = {}
check_dict = {}

with open("%s/micro_genre_tags_new.json" % path, "r", encoding='utf-8') as data,\
open("%s/relations/%s_has_genre.txt" % (path_prefix, name), "w", encoding='utf-8') as f1,\
open("%s/relations/%s_has_micro_genre.txt" % (path_prefix, name), "w", encoding='utf-8') as f2,\
open("%s/relations/%s_in_album.txt" % (path_prefix, name), "w", encoding='utf-8') as f3,\
open("%s/relations/%s_created_by.txt" % (path_prefix, name), "w", encoding='utf-8') as f4,\
open("%s/entities/%s_artist.txt" % (path_prefix, name), "w", encoding='utf-8') as e1,\
open("%s/entities/%s_album.txt" % (path_prefix, name), "w", encoding='utf-8') as e2,\
open("%s/entities/%s_artist_type.txt" % (path_prefix, name), "w", encoding='utf-8') as e3,\
open("%s/entities/%s_genre.txt" % (path_prefix, name), "w", encoding='utf-8') as e4,\
open("%s/entities/%s_track.txt" % (path_prefix, name), "w", encoding='utf-8') as t: #?????????
    for line in tqdm(data):
        entry = json.loads(line)
        #print(entry)
        track_id = str(entry['_id'])
        track_name = str(entry['track']['track']) #?????????
        main_genre = str(list(entry['tags'].keys())[0])
        micro_genre = list(entry['tags'].keys())[1:]
        
        artist_id = str(entry['artist']['artist_id'])
        artist_name = str(entry['artist']['artist'])
        artist_gender = str(entry['artist']['gender'])
        artist_type = str(entry['artist']['type'])
        
        album_id = str(entry['album']['album_id'])
        album_name = str(entry['album']['album'])
        
        check = "t" + track_id
        if check in track_dict:
            if check not in check_dict: #?????????
                check_dict[check] = 0
                track_dict[check] = track_name
                t.writelines(check + "\t" + track_name + "\n")
                
            if artist_id not in artist_dict:
                artist_dict[artist_id] = 0
                e1.writelines("a" + artist_id + "\t" + artist_name + "\t" + artist_gender + "\t" + artist_type + "\n")

            if album_id not in album_dict:
                album_dict[album_id] = 0
                e2.writelines("b" + album_id + "\t" + album_name + "\n")

            if artist_type not in type_dict:
                type_dict[artist_type] = 0
                e3.writelines(artist_type + "\n")

            if main_genre not in genre_dict:
                genre_dict[main_genre] = 0
                e4.writelines(main_genre + "\n")

            for micro in micro_genre:
                m = str(micro)
                if m not in genre_dict:
                    genre_dict[m] = 0
                    e4.writelines(m + "\n")

            created_by = "t" + track_id + "\t" + "created_by" + "\t" + "a" + artist_id + "\n"
            f4.writelines(created_by)
            #print(created_by)
            has_genre = "t" + track_id + "\t" + "has_genre" + "\t" + main_genre + "\n"
            f1.writelines(has_genre)
            #print(has_genre)
            for micro in micro_genre:
                has_micro_genre = "t" + track_id + "\t" + "has_micro_genre" + "\t" + micro + "\n"
                f2.writelines(has_micro_genre)
            #    print(has_micro_genre)
            in_album = "t" + track_id + "\t" + "in_album" + "\t" + "b" + album_id + "\n"
            f3.writelines(in_album)
            #print(in_album)
        else:
            counter += 1
            
time.sleep(0.5)            

print("done.")
print(counter)

3602111it [00:27, 132209.71it/s]


done.
3443471


In [92]:
print(len(list(check_dict.keys())))

158640


In [93]:
with open("%s/entities/%s_track_new.txt" % (path_prefix, name), "w", encoding='utf-8') as t:
    for key, value in tqdm(track_dict.items()):
        track_id = key
        track_name = value
        
        if track_name == 0:
            track_name = "???"
            
        t.writelines(track_id + "\t" + track_name + "\n")
        
time.sleep(0.5)
        
print("done.")

100%|███████████████████████████████| 278646/278646 [00:00<00:00, 712102.01it/s]


done.


#### in_album

In [94]:
# DONE

#### has_genre

In [95]:
# DONE

#### has_micro_genre

In [96]:
# DONE

#### has_gender & lives_in

In [97]:
# user_id, country, age, gender, creation_time
gcounter = 0
ccounter = 0
with open("%s_users.txt" % write_path_prefix, "r") as data,\
open("%s/relations/%s_has_gender.txt" % (path_prefix, name), "w") as f,\
open("%s/relations/%slives_in.txt" % (path_prefix, name), "w") as f2:
    datareader = csv.reader(data)
    
    for i, row in tqdm(enumerate(datareader)):
        entry = row[0].split(sep="\t")
        if entry[3] == "":
            gcounter += 1
        else:
            f.writelines(entry[0] + "\t" + "has_gender" + "\t" + entry[3] + "\n")
            
        if entry[1] == "":
            ccounter += 1
        else:
            f2.writelines(entry[0] + "\t" + "lives_in" + "\t" + entry[1] + "\n")
            
time.sleep(0.5)
print("done.")
print(gcounter)
print(ccounter)

9724it [00:00, 310632.39it/s]


done.
0
367


#### lives_in

In [98]:
# DONE

## Create KG File

In [99]:
directory = '%s/relations' % path_prefix
 
with open('%s_kg.txt' % write_path_prefix, "w") as kg:
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        with open(f, "r") as rel:
            datareader = csv.reader(rel)
            for i, row in tqdm(enumerate(datareader), desc=filename):
                kg.writelines(row[0] + "\n")

75percent_subset_has_micro_genre.txt: 773030it [00:01, 436119.30it/s]
75percent_subset_has_gender.txt: 9724it [00:00, 680460.01it/s]
75percent_subset_listened_to.txt: 4636768it [00:08, 537674.81it/s]
75percent_subsetlives_in.txt: 9357it [00:00, 690150.57it/s]
75percent_subset_created_by.txt: 158640it [00:00, 525280.52it/s]
75percent_subset_in_album.txt: 158640it [00:00, 535535.29it/s]
75percent_subset_has_genre.txt: 158640it [00:00, 517142.41it/s]


In [100]:
directory = '%s/relations' % path_prefix
 
with open('%s_kg_no_gender.txt' % write_path_prefix, "w") as kg:
    for filename in os.listdir(directory):
        if filename == 'has_gender.txt':
            continue
        f = os.path.join(directory, filename)
        with open(f, "r") as rel:
            datareader = csv.reader(rel)
            for i, row in tqdm(enumerate(datareader), desc=filename):
                kg.writelines(row[0] + "\n")

75percent_subset_has_micro_genre.txt: 773030it [00:01, 445357.56it/s]
75percent_subset_has_gender.txt: 9724it [00:00, 670757.54it/s]
75percent_subset_listened_to.txt: 4636768it [00:08, 541859.92it/s]
75percent_subsetlives_in.txt: 9357it [00:00, 681639.96it/s]
75percent_subset_created_by.txt: 158640it [00:00, 530000.10it/s]
75percent_subset_in_album.txt: 158640it [00:00, 544513.77it/s]
75percent_subset_has_genre.txt: 158640it [00:00, 507301.60it/s]


# Create the dataset files for RecBole

### Dataset structure
Recbole knowledge aware datasets require three files: .inter, .g, .link files. 

| File | Description |
|------|-------------|
|.inter|User-Item interaction|
|.kg| head, relation, tail|
|.link|item_id to entity_id|

## Create atomic files

 Add type info to kg generated by David

In [101]:
dataset = name
path = 'data/lfm'
kg_path = '%s/intermediate_kg/%s' % (path, dataset)
prefixed_path = '%s/%s' % (path,dataset)

In [102]:
kg = pd.read_csv('%s_kg.txt' % kg_path, sep='\t', names=['head_id:token', 'relation_id:token', 'tail_id:token'])

In [103]:
kg.head()

Unnamed: 0,head_id:token,relation_id:token,tail_id:token
0,t36346257,has_micro_genre,indie pop
1,t36346257,has_micro_genre,rock
2,t36346257,has_micro_genre,singer-songwriter
3,t36346257,has_micro_genre,indie rock
4,t36346257,has_micro_genre,art pop


In [104]:
kg['relation_id:token'].unique()

array(['has_micro_genre', 'has_gender', 'listened_to', 'lives_in',
       'created_by', 'in_album', 'has_genre'], dtype=object)

In [105]:
kg_no_listen_events = kg[kg['relation_id:token'] != 'listened_to']
kg_no_listen_events.to_csv('data/rb_lfm/rb_lfm.kg', sep='\t', index=False)

In [106]:
users = pd.read_csv('%s_users.tsv' % prefixed_path, sep='\t', skiprows=[0], names=['user_id:token', 'country:token', 'age:token', 'gender:token', 'creation_time:token'])
users.to_csv('data/rb_lfm/rb_lfm.user', sep='\t', index=False)

In [107]:
len(users)

9724

In [108]:
items = pd.read_csv('%s_tracks.tsv' % prefixed_path, sep='\t', skiprows=[0], names=['item_id:token', 'artist:token', 'track:token'])
items.to_csv('data/rb_lfm/rb_lfm.item', sep='\t', index=False)
                    
track_ids = pd.DataFrame(items['item_id:token'])
track_ids['entity_id:token'] = 't' + track_ids['item_id:token'].astype(str)
track_ids.to_csv('data/rb_lfm/rb_lfm.link', sep='\t', index=False)

In [109]:
listening_events = pd.read_csv('%s_listening_events.tsv' % prefixed_path, sep='\t', skiprows=[0], names=['user_id:token', 'item_id:token', 'album_id:token', 'timestamp:token'])
listening_events.to_csv('data/rb_lfm/rb_lfm.inter', sep='\t', index=False)