# Clean Dataset and create subset for KG creation

In [1]:
ratio = .5
name = '50percent_subset'

In [2]:
import pandas as pd
import numpy as np
import csv
import time
import sys
import json
import joblib
import os

from tqdm import tqdm

In [3]:
users = pd.read_csv('data/lfm/users.tsv', sep='\t')
tracks = pd.read_csv('data/lfm/tracks.tsv', sep='\t')
listening_events = pd.read_csv('data/lfm/listening_events.tsv', sep='\t')
albums = pd.read_csv('data/lfm/albums.tsv', sep='\t')

In [4]:
tracks.head()

Unnamed: 0,track_id,artist,track
0,138,Böhse Onkelz,!
1,155,Global Goon,!
2,159,Hellyeah,!
3,174,Kylie Minoise,!
4,193,Nogu Svelo!,!


## Clean Dataset

<b> Remove users which gender is not defined and whose age is not element of [10;90] </b>

In [5]:
l = len(users)
users = users[users.gender.isin(['m', 'f']) & (users.age >= 10) & (users.age <= 95)]
print('Removed %s users' % (len(users) - l))

Removed -5163 users


<b> Remove now unassigned listening events </b>

In [6]:
l = len(listening_events)
listening_events = listening_events[listening_events.user_id.isin(users.user_id)]
print('Removed %s listening events' % (len(listening_events) - l))

Removed -9725559 listening events


<b> 10-core filtering </b>

In [7]:
n_core = 5
while True:
    start_number = len(listening_events)

    # Item pass
    item_counts = listening_events.track_id.value_counts()
    item_above = set(item_counts[item_counts >= n_core].index)
    listening_events = listening_events[listening_events.track_id.isin(item_above)]
    print('Records after item pass: ', len(listening_events))

    # User pass
    user_counts = listening_events.user_id.value_counts()
    user_above = set(user_counts[user_counts >= n_core].index)
    listening_events = listening_events[listening_events.user_id.isin(user_above)]
    print('Records after user pass: ', len(listening_events))

    if len(listening_events) == start_number:
        print('Exiting...')
        break

Records after item pass:  16789691
Records after user pass:  16789298
Records after item pass:  16789169
Records after user pass:  16789169
Records after item pass:  16789169
Records after user pass:  16789169
Exiting...


In [8]:
users = users[users.user_id.isin(set(listening_events.user_id))]
tracks = tracks[tracks.track_id.isin(set(listening_events.track_id))]

In [9]:
users.sample(1)

Unnamed: 0,user_id,country,age,gender,creation_time
13900,76470,BR,28,f,2011-11-23 12:06:21


## Create a subset given a ratio

In [10]:
n = ratio*len(tracks)
n = int(np.ceil(n))
n

351013

In [11]:
tracks_subset = tracks.sample(n)
tracks_subset.to_csv('data/lfm/%s_tracks.tsv' % name, sep='\t', index=False)
tracks_subset.head()

Unnamed: 0,track_id,artist,track
1428788,18526587,Moloko,Ho Humm
1950992,24658968,NOVAMENCO,Lorca
3411585,41657773,Titãs,Toda Cor
3467748,42318836,Stam1na,Tsunami - Remastered
2324052,29036145,O Rappa,Não Vão Me Matar


In [12]:
listening_events_subset = listening_events[listening_events['track_id'].isin(tracks_subset['track_id'])]
listening_events_subset.to_csv('data/lfm/%s_listening_events.tsv' % name, sep='\t', index=False)
listening_events_subset

Unnamed: 0,user_id,track_id,album_id,timestamp
3,42410,14250568,8916172,2020-01-01 00:00:01
19,35601,21168639,18691627,2020-01-01 00:00:05
20,60352,30828639,11273927,2020-01-01 00:00:06
30,18190,9665380,6880873,2020-01-01 00:00:08
34,15157,28672519,14898006,2020-01-01 00:00:09
...,...,...,...,...
30357771,56985,32555849,13969153,2020-03-20 12:59:57
30357776,48354,24730041,13118529,2020-03-20 12:59:58
30357777,15709,28558946,14874786,2020-03-20 12:59:58
30357779,59921,42135253,21202843,2020-03-20 12:59:58


In [13]:
users_subset = users[users['user_id'].isin(listening_events_subset['user_id'].unique())]
users_subset.to_csv('data/lfm/%s_users.tsv' % name, sep='\t', index=False)
users_subset

Unnamed: 0,user_id,country,age,gender,creation_time
0,2,UK,35,m,2002-10-29 01:00:00
2,14,UK,48,m,2003-02-18 21:44:13
3,15,US,28,m,2003-02-24 03:30:33
7,36,UK,29,m,2003-03-27 12:17:12
8,42,RU,30,m,2003-04-10 01:07:48
...,...,...,...,...,...
15242,119397,SK,31,m,2012-05-25 17:41:55
15243,119516,RU,19,m,2012-05-26 10:23:12
15252,119957,PL,16,f,2012-05-28 19:59:37
15254,120095,RU,20,f,2012-05-29 21:19:14


In [14]:
albums_subset = albums[albums['album_id'].isin(listening_events_subset['album_id'].unique())]
albums_subset.to_csv('data/lfm/%s_albums.tsv' % name, sep='\t', index=False)
albums_subset

Unnamed: 0,album_id,artist,album
2,167,!!!,
27,1044,Blue Gene Tyranny,
35,1196,Changing of the Seasons,
73,1933,Heart Attack,
136,2931,No No Play Safe Song | +More Nursery Rhymes & ...,
...,...,...,...
1676954,24232012,DIR EN GREY,ＭＡＣＡＢＲＥ
1676977,24233751,DIR EN GREY,ＴＨＥ　ＵＮＲＡＶＥＬＩＮＧ
1676979,24234103,ONE OK ROCK,Ｔｈｅ　Ｂｅｇｉｎｎｉｎｇ
1676986,24234395,DIR EN GREY,ＶＵＬＧＡＲ


# Create KG files

In [15]:
#csv.field_size_limit(sys.maxsize)

## Entities

- user
- artist
- album
- track
- genre

**Additional information:**
- gender
- country

#### Users

In [16]:
path = 'data/lfm'
path_prefix = 'data/lfm/intermediate_kg'
read_path_prefix = 'data/lfm/%s' % name
write_path_prefix = 'data/lfm/intermediate_kg/%s' % name

In [17]:
# user_id, country, age, gender, creation_time
with open("%s_users.tsv" % read_path_prefix, "r", encoding='utf-8') as data, open("%s_users.txt" % write_path_prefix, "w") as f:
    datareader = csv.reader(data, quoting=csv.QUOTE_NONE)
    
    for i, row in tqdm(enumerate(datareader)):
        if i == 0:
            header = row[0].split(sep="\t")
        if i != 0:
            entry = row[0].split(sep="\t")
            f.writelines("u" + entry[0] + "\t" + entry[1] + "\t" + entry[2] + "\t" + entry[3] + "\n")
            
time.sleep(0.5)            

print(header)
print("done.")

9801it [00:00, 420976.69it/s]


['user_id', 'country', 'age', 'gender', 'creation_time']
done.


#### Artists

In [18]:
# artist_id, artist_name
#with open("data/artists.tsv", "r", encoding='utf-8') as data, open("data/kg/artists.txt", "w") as f:
#    datareader = csv.reader(data, quoting=csv.QUOTE_NONE)
#    
#    for i, row in tqdm(enumerate(datareader)):
#        if i == 0:
#            header = row[0].split(sep="\t")
#        if i != 0:
#            entry = row[0].split(sep="\t")
#            f.writelines("a" + entry[0] + "\t" + str(entry[1].encode("utf-8")) + "\n")
#            
#time.sleep(0.5)            
#
#print(header)
#print("done.")

#### Albums

In [19]:
# album_id, album_name, artist_name
#with open("data/albums.tsv", "r", encoding='utf-8') as data, open("data/kg/albums.txt", "w") as f:
#    datareader = csv.reader(data, quoting=csv.QUOTE_NONE)
#    
#    for i, row in tqdm(enumerate(datareader)):
#        if i == 0:
#            header = row[0].split(sep="\t")
#        if i != 0:
#            entry = row[0].split(sep="\t")
#            f.writelines("b" + entry[0] + "\t" + str(entry[1].encode("utf-8")) +  "\n")
#            
#time.sleep(0.5)
#
#print(header)
#print("done.")

#### Tracks

In [20]:
# track_id, artist_name, track_name
track_complete_dict = {}
counter = 0
with open("%s_tracks.tsv" % read_path_prefix, "r", encoding='utf-8') as data, open("%s_tracks.txt" % write_path_prefix, "w") as f:
    datareader = csv.reader(data, quoting=csv.QUOTE_NONE)
    
    for i, row in tqdm(enumerate(datareader)):
        if i == 0:
            header = row[0].split(sep="\t")
        if i != 0:
            entry = row[0].split(sep="\t")
            try:
                f.writelines("t" + entry[0] + "\t" + str(entry[2].encode("utf-8")) + "\n")
                track_id = "t" + entry[0]
                if track_id not in track_complete_dict:
                    track_complete_dict[track_id] = 0
            except:
                f.writelines("t" + entry[0] + "\t" + "???" + "\n")
                counter += 1
            
time.sleep(0.5)            

print(header)
print("done.")
print(counter)

351015it [00:01, 306964.09it/s]


['track_id', 'artist', 'track']
done.
2080


In [21]:
print(len(list(track_complete_dict.keys())))

348934


#### Genres

In [22]:
#counter = 0
#with open("data/micro_genre_tags.json", "r", encoding='utf-8') as data, open("data/kg/genres.txt", "w") as f:
#    for line in tqdm(data):
#        try:
#            entry = json.loads(line)
#            track_name = entry['_id']['track']
#            artist_name = entry['_id']['artist']
#            main_genre = list(entry['tags'].keys())[0]
#            genres = ",".join(list(entry['tags'].keys()))
#
#            f.writelines(track_name + "\t" + artist_name + "\t" + main_genre + "\t" + genres + "\n")
#        except:
#            counter += 1
#            continue
#            
#time.sleep(0.5)            
#
#print(header)
#print("done.")
#print(counter)

#### Gender & Country

In [23]:
gdict = {}
cdict = {}

gcounter = 0
ccounter = 0

with open("%s_users.txt" % write_path_prefix, "r") as data,\
open("%s/entities/%s_gender.txt"  % (path_prefix, name), "w") as f,\
open("%s/entities/%s_country.txt" % (path_prefix, name), "w") as f2:
    datareader = csv.reader(data)
    
    for i, row in tqdm(enumerate(datareader)):
        entry = row[0].split(sep="\t")
        if entry[3] == "" or entry[3] in gdict:
            gcounter += 1
        else:
            gdict[entry[3]] = 0
            f.writelines(entry[3] + "\n")
            
        if entry[1] == "" or entry[1] in cdict:
            ccounter += 1
        else:
            cdict[entry[1]] = 0
            f2.writelines(entry[1] + "\n")
            
time.sleep(0.5)
            
print("done.")
print(gcounter)
print(ccounter)

9800it [00:00, 1142290.44it/s]


done.
9798
9674


#### Country

In [24]:
# DONE

## Relations

- **listened_to:** _user_ listened_to _track_
- **in_album:** _track_ in_album _album_
- **created_by:** _track_ created_by _artist_
- **has_genre**: _track_ has_genre _genre_

**Additional information:**
- **has_micro_genre**: _track_ has_micro_genre _genre_ (**or** instead of has_genre)
- **has_gender**: _user_ has_gender _gender_
- **lives_in**: _user_ lives_in _country_

In [25]:
# check genres.txt --> clean track and artist names, we filter it with that list?
# create new dict mit track names, artist names and look up for all the relations

# track_name, artist_name, main_genre, micro_genre

#track_dict = {}

#with open("data/kg/genres.txt", "r") as data: #, open("data/kg/tracks.txt", "w") as f:
#    datareader = csv.reader(data)
    
#    for i, row in tqdm(enumerate(datareader)):
#        try:
#            helper = row[0].split("\t")
#            track_name = helper[0]
#            artist_name = helper[1]
#            main_genre = helper[2]
#            micro_genre = row[1:]
#        except:
#            row = ", ".join(row)
#            helper = row.split("\t")
            
#            track_name = helper[0]
#            artist_name = helper[1]
#            main_genre = helper[2]
#            micro_genre = helper[3].split(",")[1:]
#            continue
        
#        key = track_name # + " || " + artist_name
        
#        if key not in track_dict:
#            track_dict[key] = {}
#            track_dict[key]["artist"] = artist_name
#            track_dict[key]["main_genre"] = main_genre
#            track_dict[key]["micro_genre"] = micro_genre
        
#print("saving...")

#joblib.dump(track_dict, "data/kg/track_dict.pkl")

#print("done.")

#### listened_to

In [26]:
# user_id, track_id, album_id, timestamp
counter = 0
counts_dict = {}
track_dict = {}
with open("%s_listening_events.tsv" % (read_path_prefix), "r") as data, open("%s/relations/%s_listened_to.txt" % (path_prefix, name), "w") as f:
    datareader = csv.reader(data)
    
    for i, row in tqdm(enumerate(datareader)):
        if i == 0:
            header = row[0].split(sep="\t")
        if i != 0:
            entry = row[0].split(sep="\t")
            
            track_id = "t" + entry[1]
            if track_id not in track_dict:
                track_dict[track_id] = 0
            
            key = "u" + entry[0] + ", " + "listened_to" + ", " + "t" + entry[1]
            if key not in counts_dict:
                counts_dict[key] = 0
                f.writelines("u" + entry[0] + "\t" + "listened_to" + "\t" + "t" + entry[1] + "\n")
            else:
                counts_dict[key] += 1
                
            #try:
            #    f.writelines("u" + entry[0] + "\t" + "listened_to" + "\t" + "t" + entry[1] + "\n")
            #except:
            #    counter += 1
            #    continue
            
time.sleep(0.5)            

print(header)
print("done.")
print(counter)

8381828it [00:20, 399497.93it/s]


['user_id', 'track_id', 'album_id', 'timestamp']
done.
0


In [27]:
print(len(list(track_dict.keys())))

351013


#### created_by & has_genre & has_micro_genre & in_album

In [28]:
counter = 0

artist_dict = {}
album_dict = {}
type_dict = {}
genre_dict = {}
check_dict = {}

with open("%s/micro_genre_tags_new.json" % path, "r", encoding='utf-8') as data,\
open("%s/relations/%s_has_genre.txt" % (path_prefix, name), "w", encoding='utf-8') as f1,\
open("%s/relations/%s_has_micro_genre.txt" % (path_prefix, name), "w", encoding='utf-8') as f2,\
open("%s/relations/%s_in_album.txt" % (path_prefix, name), "w", encoding='utf-8') as f3,\
open("%s/relations/%s_created_by.txt" % (path_prefix, name), "w", encoding='utf-8') as f4,\
open("%s/entities/%s_artist.txt" % (path_prefix, name), "w", encoding='utf-8') as e1,\
open("%s/entities/%s_album.txt" % (path_prefix, name), "w", encoding='utf-8') as e2,\
open("%s/entities/%s_artist_type.txt" % (path_prefix, name), "w", encoding='utf-8') as e3,\
open("%s/entities/%s_genre.txt" % (path_prefix, name), "w", encoding='utf-8') as e4,\
open("%s/entities/%s_track.txt" % (path_prefix, name), "w", encoding='utf-8') as t: #?????????
    for line in tqdm(data):
        entry = json.loads(line)
        #print(entry)
        track_id = str(entry['_id'])
        track_name = str(entry['track']['track']) #?????????
        main_genre = str(list(entry['tags'].keys())[0])
        micro_genre = list(entry['tags'].keys())[1:]
        
        artist_id = str(entry['artist']['artist_id'])
        artist_name = str(entry['artist']['artist'])
        artist_gender = str(entry['artist']['gender'])
        artist_type = str(entry['artist']['type'])
        
        album_id = str(entry['album']['album_id'])
        album_name = str(entry['album']['album'])
        
        check = "t" + track_id
        if check in track_dict:
            if check not in check_dict: #?????????
                check_dict[check] = 0
                track_dict[check] = track_name
                t.writelines(check + "\t" + track_name + "\n")
                
            if artist_id not in artist_dict:
                artist_dict[artist_id] = 0
                e1.writelines("a" + artist_id + "\t" + artist_name + "\t" + artist_gender + "\t" + artist_type + "\n")

            if album_id not in album_dict:
                album_dict[album_id] = 0
                e2.writelines("b" + album_id + "\t" + album_name + "\n")

            if artist_type not in type_dict:
                type_dict[artist_type] = 0
                e3.writelines(artist_type + "\n")

            if main_genre not in genre_dict:
                genre_dict[main_genre] = 0
                e4.writelines(main_genre + "\n")

            for micro in micro_genre:
                m = str(micro)
                if m not in genre_dict:
                    genre_dict[m] = 0
                    e4.writelines(m + "\n")

            created_by = "t" + track_id + "\t" + "created_by" + "\t" + "a" + artist_id + "\n"
            f4.writelines(created_by)
            #print(created_by)
            has_genre = "t" + track_id + "\t" + "has_genre" + "\t" + main_genre + "\n"
            f1.writelines(has_genre)
            #print(has_genre)
            for micro in micro_genre:
                has_micro_genre = "t" + track_id + "\t" + "has_micro_genre" + "\t" + micro + "\n"
                f2.writelines(has_micro_genre)
            #    print(has_micro_genre)
            in_album = "t" + track_id + "\t" + "in_album" + "\t" + "b" + album_id + "\n"
            f3.writelines(in_album)
            #print(in_album)
        else:
            counter += 1
            
time.sleep(0.5)            

print("done.")
print(counter)

3602111it [00:27, 131969.17it/s]


done.
3445276


In [29]:
print(len(list(check_dict.keys())))

156835


In [30]:
with open("%s/entities/%s_track_new.txt" % (path_prefix, name), "w", encoding='utf-8') as t:
    for key, value in tqdm(track_dict.items()):
        track_id = key
        track_name = value
        
        if track_name == 0:
            track_name = "???"
            
        t.writelines(track_id + "\t" + track_name + "\n")
        
time.sleep(0.5)
        
print("done.")

100%|███████████████████████████████| 351013/351013 [00:00<00:00, 730710.49it/s]


done.


#### in_album

In [31]:
# DONE

#### has_genre

In [32]:
# DONE

#### has_micro_genre

In [33]:
# DONE

#### has_gender & lives_in

In [34]:
# user_id, country, age, gender, creation_time
gcounter = 0
ccounter = 0
with open("%s_users.txt" % write_path_prefix, "r") as data,\
open("%s/relations/%s_has_gender.txt" % (path_prefix, name), "w") as f,\
open("%s/relations/%slives_in.txt" % (path_prefix, name), "w") as f2:
    datareader = csv.reader(data)
    
    for i, row in tqdm(enumerate(datareader)):
        entry = row[0].split(sep="\t")
        if entry[3] == "":
            gcounter += 1
        else:
            f.writelines(entry[0] + "\t" + "has_gender" + "\t" + entry[3] + "\n")
            
        if entry[1] == "":
            ccounter += 1
        else:
            f2.writelines(entry[0] + "\t" + "lives_in" + "\t" + entry[1] + "\n")
            
time.sleep(0.5)
print("done.")
print(gcounter)
print(ccounter)

9800it [00:00, 350281.89it/s]


done.
0
368


#### lives_in

In [35]:
# DONE

## Create KG File

In [36]:
directory = '%s/relations' % path_prefix
 
with open('%s_kg.txt' % write_path_prefix, "w") as kg:
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        with open(f, "r") as rel:
            datareader = csv.reader(rel)
            for i, row in tqdm(enumerate(datareader), desc=filename):
                kg.writelines(row[0] + "\n")

50percent_subset_has_micro_genre.txt: 640917it [00:01, 433719.76it/s]
50percent_subset_listened_to.txt: 3507973it [00:06, 534939.52it/s]
50percent_subset_has_genre.txt: 156835it [00:00, 510191.73it/s]
50percent_subset_has_gender.txt: 9800it [00:00, 658879.20it/s]
50percent_subset_in_album.txt: 156835it [00:00, 535267.41it/s]
50percent_subsetlives_in.txt: 9432it [00:00, 514643.88it/s]
50percent_subset_created_by.txt: 156835it [00:00, 502143.24it/s]


In [37]:
directory = '%s/relations' % path_prefix
 
with open('%s_kg_no_gender.txt' % write_path_prefix, "w") as kg:
    for filename in os.listdir(directory):
        if filename == 'has_gender.txt':
            continue
        f = os.path.join(directory, filename)
        with open(f, "r") as rel:
            datareader = csv.reader(rel)
            for i, row in tqdm(enumerate(datareader), desc=filename):
                kg.writelines(row[0] + "\n")

50percent_subset_has_micro_genre.txt: 640917it [00:01, 436980.40it/s]
50percent_subset_listened_to.txt: 3507973it [00:06, 540294.16it/s]
50percent_subset_has_genre.txt: 156835it [00:00, 523993.99it/s]
50percent_subset_has_gender.txt: 9800it [00:00, 674314.34it/s]
50percent_subset_in_album.txt: 156835it [00:00, 537170.10it/s]
50percent_subsetlives_in.txt: 9432it [00:00, 666127.99it/s]
50percent_subset_created_by.txt: 156835it [00:00, 506380.95it/s]


# Create the dataset files for RecBole

### Dataset structure
Recbole knowledge aware datasets require three files: .inter, .g, .link files. 

| File | Description |
|------|-------------|
|.inter|User-Item interaction|
|.kg| head, relation, tail|
|.link|item_id to entity_id|

## Create atomic files

 Add type info to kg generated by David

In [38]:
dataset = name
path = 'data/lfm'
kg_path = '%s/intermediate_kg/%s' % (path, dataset)
prefixed_path = '%s/%s' % (path,dataset)

In [39]:
kg = pd.read_csv('%s_kg.txt' % kg_path, sep='\t', names=['head_id:token', 'relation_id:token', 'tail_id:token'])

In [40]:
kg.head()

Unnamed: 0,head_id:token,relation_id:token,tail_id:token
0,t31469731,has_micro_genre,indie rock
1,t31469731,has_micro_genre,indie pop
2,t31469731,has_micro_genre,rock
3,t31469731,has_micro_genre,alternative rock
4,t31469731,has_micro_genre,synthpop


In [41]:
kg['relation_id:token'].unique()

array(['has_micro_genre', 'listened_to', 'has_genre', 'has_gender',
       'in_album', 'lives_in', 'created_by'], dtype=object)

In [42]:
kg_no_listen_events = kg[kg['relation_id:token'] != 'listened_to']
kg_no_listen_events.to_csv('data/rb_lfm/rb_lfm.kg', sep='\t', index=False)

In [43]:
users = pd.read_csv('%s_users.tsv' % prefixed_path, sep='\t', skiprows=[0], names=['user_id:token', 'country:token', 'age:token', 'gender:token', 'creation_time:token'])
users.to_csv('data/rb_lfm/rb_lfm.user', sep='\t', index=False)

In [44]:
len(users)

9800

In [45]:
items = pd.read_csv('%s_tracks.tsv' % prefixed_path, sep='\t', skiprows=[0], names=['item_id:token', 'artist:token', 'track:token'])
items.to_csv('data/rb_lfm/rb_lfm.item', sep='\t', index=False)
                    
track_ids = pd.DataFrame(items['item_id:token'])
track_ids['entity_id:token'] = 't' + track_ids['item_id:token'].astype(str)
track_ids.to_csv('data/rb_lfm/rb_lfm.link', sep='\t', index=False)

In [46]:
listening_events = pd.read_csv('%s_listening_events.tsv' % prefixed_path, sep='\t', skiprows=[0], names=['user_id:token', 'item_id:token', 'album_id:token', 'timestamp:token'])
listening_events.to_csv('data/rb_lfm/rb_lfm.inter', sep='\t', index=False)